update interfaces of activate function

80b83983 · xuchen · 2ab2afc9 · 80b83983 · 80b83983 · 80b83983
Commit 80b83983 authored Jul 21, 2019 by xuchen
--- a/Makefile
+++ b/Makefile
 # the prefix of the generated executable file
-PREFIX := niutrans
-TENSOR := $(PREFIX).tensor
-NETWORK := $(PREFIX).network
+PREFIX = NiuTrans
+NIUTRANS_EXE := $(PREFIX).Tensor

-# code path
-SRC = ./source
+# code path and generated file path
+ROOT = .
+SRC = $(ROOT)/source
+LIB_DIR = $(ROOT)/lib
+EXE_DIR = $(ROOT)/bin

-# use gpu ?
+# whether to build the dynamic link library (0 - skip, 1 - build)
+dll = 0
+
+# 0 - use CPU 
+# 1 - use GPU
 USE_CUDA = 1
 # modify this path if neccessary
 CUDA_ROOT = /usr/local/cuda-9.0
@@ -65,8 +71,8 @@ ifeq ($(USE_MKL), 1)
 	                 $(MKL_LIB_DIR)/libmkl_core.a \
 					 $(MKL_LIB_DIR)/libmkl_intel_thread.a \
 					 $(INTEL_ROOT)/lib/intel64/libiomp5.a                                              
-    #DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
-endif   
+    DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
+endif
 ifeq ($(USE_OPENBLAS), 1)
    STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
    DYNAMIC_DEPLIB += -lopenblas
@@ -93,25 +99,22 @@ ifeq ($(USE_INTEL_COMPILER), 1)
 endif

 # main file
-MAIN_FILE = Main.cpp 
-Tensor_Main := $(SRC)/tensor/$(MAIN_FILE)
-Network_Main := $(SRC)/network/$(MAIN_FILE)
-
-TENSOR_CPU := $(TENSOR).cpu
-TENSOR_GPU := $(TENSOR).gpu
-NETWORK_CPU := $(NETWORK).cpu
-NETWORK_GPU := $(NETWORK).gpu
+MAIN_FILE = $(SRC)/network/Main.cpp
+Tensor_Main := $(SRC)/tensor/Main.cpp
+Network_Main := $(SRC)/network/Main.cpp

 ifeq ($(USE_CUDA), 1)
-	TENSOR := $(TENSOR_GPU)
-	NETWORK := $(NETWORK_GPU)
+	NIUTRANS_EXE := $(NIUTRANS_EXE).GPU
 else
-	TENSOR := $(TENSOR_CPU)
-	NETWORK := $(NETWORK_CPU)
+	NIUTRANS_EXE := $(NIUTRANS_EXE).CPU
 endif

+NIUTRANS_DLL := $(LIB_DIR)/lib$(NIUTRANS_EXE).so
+
+NIUTRANS_EXE := $(EXE_DIR)/$(NIUTRANS_EXE)
+
 # specify the compiling arguments here
-CFLAGS = -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
+CFLAGS = -std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format

 # gtx 1080 arch=compute_61,code=sm_61
 # k80 arch=compute_37,code=sm_37
@@ -154,26 +157,52 @@ ifeq ($(USE_CUDA), 1)
 	OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
 endif

-all: start tensor network finish
+# the targets below are commands rather than files; note that LIB_DIR is a
+# directory named "lib", i.e., the same name as the "lib" target, so the
+# targets are declared phony to keep make from treating them as files
+.PHONY: all start finish lib start_lib niutrans_dll finish_lib exe start_exe niutrans_exe finish_exe
+
+# default goal: build the library (if enabled) and then the executable file
+all: start lib exe finish

-tensor: $(TENSOR)
+start:
+	@echo ""
+	@echo "Start building ..."

-network: $(NETWORK)
+lib: start_lib niutrans_dll finish_lib

-$(TENSOR): $(OBJS) $(Tensor_Main)
-	@echo "Making executable file: $(TENSOR)"
-	@$(CXX) $(Tensor_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
+start_lib:
+	@mkdir -p $(LIB_DIR)
+	@echo ""
+	@echo "Start building library"

-$(NETWORK): $(OBJS) $(Network_Main)
-	@echo "Making executable file: $(NETWORK)"
-	@$(CXX) $(Network_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
+niutrans_dll: $(NIUTRANS_DLL)

-start:
+# link all object files into the dynamic link library when dll is 1;
+# otherwise only a message is printed and no file is generated.
+# the output directory is created here as well (not only in start_lib), so
+# the rule also works when the library file is requested directly or when
+# the targets are built in parallel
+$(NIUTRANS_DLL): $(OBJS)
+ifeq ($(dll), 1)
+	@mkdir -p $(@D)
+	@echo "Building dynamic link library: $(NIUTRANS_DLL)"
+	@$(CXX) -shared -Wall $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
+else
+	@echo "Skip building dynamic link library"
+endif
+	
+finish_lib:
+	@echo "Finish building library"
+	@echo ""
+
+exe: start_exe niutrans_exe finish_exe
+
+start_exe:
+	@mkdir -p $(EXE_DIR)
+	@echo ""
+	@echo "Start building executable file"
+
+niutrans_exe: $(NIUTRANS_EXE)
+
+# compile the main file and link it with all object files into the executable.
+# the output directory is created here as well (not only in start_exe), so
+# the rule also works when the executable file is requested directly or when
+# the targets are built in parallel
+$(NIUTRANS_EXE): $(OBJS) $(MAIN_FILE)
+	@mkdir -p $(@D)
+	@echo "Building executable file: $(NIUTRANS_EXE)"
+	@$(CXX) $(MAIN_FILE) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
+
+finish_exe:
+	@echo "Finish building executable file"
 	@echo ""
-	@echo "Start Making ..."

 finish:
-	@echo "finish Making ..."
+	@echo "Finish building ..."
 	@echo ""

 %.o: %.c
@@ -183,13 +212,14 @@ finish:
 	@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@

 %.cuo: %.cu
+ifeq ($(dll), 1)
+	@$(NVCC) --shared --compiler-options '-fPIC' $(CUDA_FLAG) -c $< -o $@
+else
 	@$(NVCC) $(CUDA_FLAG) -c $< -o $@
+endif

 .PHONY: clean
 clean:
-	@echo "Making clean object files"
+	@echo "Cleaning object files"
 	@-rm -f $(OBJS)
-
-cleanexe:
-	@echo "Making clean executable files"
-	@-rm -f $(TENSOR_CPU) $(NETWORK_CPU) $(TENSOR_GPU) $(NETWORK_GPU)
\ No newline at end of file
+
+# remove the generated executable file and dynamic link library
+# (rm -f does not complain if the library was never built)
+.PHONY: cleanexe
+cleanexe:
+	@echo "Cleaning executable and library files"
+	@-rm -f $(NIUTRANS_EXE) $(NIUTRANS_DLL)
\ No newline at end of file
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -35,8 +35,6 @@
 void BackwardTest();
 void TransposeTest();
 void SumDimTest();
-void SplitBackwardTest();
-void MemTest();

 using namespace nts;
 using namespace fnnlm;
@@ -44,15 +42,12 @@ using namespace transformer;

 int main( int argc, const char ** argv )
 {
-    //MemTest();
-    //return 0;
-    //SplitBackwardTest();
-    //return 0;
-    //_CrtSetBreakAlloc(896);
-    //BackwardTest();
-    //return 0;
-
-    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
+    //_CrtSetBreakAlloc(2708);
+
+    if(argc > 1 && !strcmp(argv[1], "-test"))
+        Test();
+    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        TransformerMain(argc - 1, argv + 1);
@@ -61,6 +56,7 @@ int main( int argc, const char ** argv )
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
+        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
    }

    //_CrtDumpMemoryLeaks();
@@ -215,67 +211,3 @@ void SumDimTest()

    delete[] data;
 }
-
-void SplitBackwardTest()
-{
-    int * dimSize = new int[2];
-    dimSize[0] = 2;
-    dimSize[1] = 4;
-
-    XTensor t1;
-    InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
-    XTensor t2;
-    InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
-    XTensor tensor;
-    
-    //_SetDataFixedFloat(&t1, 1.0F);
-    //_SetDataFixedFloat(&t2, 2.0F);
-    t1.SetDataRand();
-    t2.SetDataRand();
-
-    tensor = t1 + t2;
-
-    XList smalls;
-
-    XTensor first;
-    XTensor second;
-    InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
-    InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
-    smalls.Add(&first);
-    smalls.Add(&second);
-
-    Split(tensor, smalls, 1, 2);
-
-    XTensor mul;
-    mul = Sum(first, second);
-
-    XNet net;
-    net.Backward(mul);
-    net.Dump(stderr);
-
-    printf("Done!");
-}
-
-void MemTest()
-{
-    XMem * mem;
-    mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
-    
-    XTensor tensor;
-    InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
-    
-    tensor.SetZeroAll();
-
-    tensor.Dump(stderr);
-
-    delete mem;
-
-    if (tensor.mem != NULL) {
-        printf("It isn't null!\n");
-        printf("%d\n", (int)tensor.mem->signature);
-    }
-    else {
-        printf("It's null\n");
-    }
-    tensor.Dump(stderr);
-}
\ No newline at end of file
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -43,18 +43,18 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    XNoder::MakeGrad(input);

    if(operID == FUNC_HARDTANH)
-        _HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _HardTanHBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_IDENTITY)
-        _IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _IdentityBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_LOGSOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else if(operID == FUNC_RECTIFY)
-        _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _RectifyBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_SIGMOID)
-        _SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _SigmoidBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_SOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -20,7 +20,9 @@
 */

 #include "XBackwardLoss.h"
+#include "XNoder.h"
 #include "../tensor/XName.h"
+#include "../tensor/function/FHeader.h"
 #include "../tensor/core/getandset/SetData.h"
 #include "../tensor/function/HardTanH.h"
 #include "../tensor/function/Identity.h"
@@ -31,6 +33,60 @@

 namespace nts{

+
+/* 
+compute the gradient of a loss node with respect to its first input
+(the output y of the network), i.e., dE/dy is written to y->grad
+>> node - the node generated by the loss operation
+>> isEfficient - not used in this function
+*/
+void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    int operID = income.typeID;
+
+    CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
+
+    XTensor * output = income.tails[0];
+
+    XNoder::MakeGrad(output);
+    XTensor * dedy = output->grad;
+
+    if (income.tailNum == 1) {
+        /* no gold standard is attached to the node: dE/dy is simply set to 1 */
+        if(dedy->dataType == X_FLOAT)
+            _SetDataFixedFloat(dedy, 1.0F);
+        else if(dedy->dataType == X_DOUBLE)
+            _SetDataFixedDouble(dedy, 1.0);
+        else if(dedy->dataType == X_INT)
+            _SetDataFixedInt(dedy, 1);
+        else
+            ShowNTErrors("TODO");
+
+        /* the node is done on this path too, as on the path below */
+        node->visitMark = NODE_FINISHED;
+        return;
+    }
+
+    /* the second input is the gold standard */
+    XTensor * gold = income.tails[1];
+
+    /* no weight tensor is read from the node, so it is always NULL here */
+    XTensor * weight = NULL;
+    XTensor * padding = NULL;
+
+    if(operID == LOSS_CROSSENTROPY) {
+        /* the third input (if any) is the padding */
+        if (income.tailNum == 3) 
+            padding = income.tails[2];
+        int leadingDim = income.GetParamInt(0);
+        CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in cross entropy!");
+        _CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
+    }
+    else{
+        ShowNTErrors("Wrong loss function type!");
+    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* 
+check whether a node is generated by a loss operation
+>> node - the node to check
+<< return - true if the type id of the incoming edge shares bits with LOSS_BASE
+*/
+bool XLossGrad::IsLossOP(XTensor * node)
+{
+    const int typeID = node->income.typeID;
+    return (typeID & LOSS_BASE) != 0;
+}
+
 /* 
 compute dE/dx for a given function y = f(x) 
 >> gold - gold standard to measure error (or loss)
@@ -42,39 +98,39 @@ compute dE/dx for a given function y = f(x)
 >> params - parameters of the function
 >> lossName - name of the loss, e.g., cross entropy
 */
-void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                        XTensor * dedy, XTensor * dedx, XTensor * padding,
-                        int funcID, void * params,
-                        LOSS_FUNCTION_NAME lossName)
-{
-    CheckNTErrors(gold && y && x, "Empty input tensors!");
-    CheckNTErrors(dedx, "Empty gradient tensors!");
-    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
-
-    if(funcID == FUNC_HARDTANH){
-        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_IDENTITY){
-        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_LOGSOFTMAX){
-        int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-    }
-    else if(funcID == FUNC_RECTIFY){
-        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_SIGMOID){
-        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
-    }else if(funcID == FUNC_SOFTMAX){
-        int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-    }
-    else{
-        ShowNTErrors("wrong function found when call the backward process!");
-    }
-
-}
+//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
+//                        XTensor * dedy, XTensor * dedx, XTensor * padding,
+//                        int funcID, void * params,
+//                        LOSS_FUNCTION_NAME lossName)
+//{
+//    CheckNTErrors(gold && y && x, "Empty input tensors!");
+//    CheckNTErrors(dedx, "Empty gradient tensors!");
+//    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
+//
+//    if(funcID == FUNC_HARDTANH){
+//        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
+//    }
+//    else if(funcID == FUNC_IDENTITY){
+//        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
+//    }
+//    else if(funcID == FUNC_LOGSOFTMAX){
+//        int leadDim = *(int*)params;
+//        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
+//    }
+//    else if(funcID == FUNC_RECTIFY){
+//        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
+//    }
+//    else if(funcID == FUNC_SIGMOID){
+//        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
+//    }else if(funcID == FUNC_SOFTMAX){
+//        int leadDim = *(int*)params;
+//        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
+//    }
+//    else{
+//        ShowNTErrors("wrong function found when call the backward process!");
+//    }
+//
+//}

 /* 
 compute dE/dy for variable y and error(loss) function E
@@ -83,27 +139,27 @@ compute dE/dy for variable y and error(loss) function E
 >> dedy - dE/dy
 >> lossName - name of the loss, e.g., cross entropy
 */
-void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-                        XTensor * dedy, XTensor * padding,
-                        LOSS_FUNCTION_NAME lossName)
-{
-    if(gold == NULL){
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else{
-            ShowNTErrors("TODO");
-        }
-        return;
-    }
-
-    //_LossBackward(dedy, gold, y, lossName);
-    if(lossName == CROSSENTROPY)
-        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
-
-}
+//void XLossGrad::Compute(XTensor * gold, XTensor * y, 
+//                        XTensor * dedy, XTensor * padding,
+//                        LOSS_FUNCTION_NAME lossName)
+//{
+//    if(gold == NULL){
+//        if(dedy->dataType == X_FLOAT)
+//            _SetDataFixedFloat(dedy, 1.0F);
+//        else if(dedy->dataType == X_DOUBLE)
+//            _SetDataFixedDouble(dedy, 1.0);
+//        else if(dedy->dataType == X_INT)
+//            _SetDataFixedInt(dedy, 1);
+//        else{
+//            ShowNTErrors("TODO");
+//        }
+//        return;
+//    }
+//
+//    //_LossBackward(dedy, gold, y, lossName);
+//    if(lossName == CROSSENTROPY)
+//        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
+//
+//}

 }
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
@@ -23,6 +23,7 @@

 #include "../tensor/XTensor.h"
 #include "../tensor/function/FHeader.h"
+#include "../tensor/loss/LHeader.h"

 #ifndef __XBACKWARDLOSS_H__
 #define __XBACKWARDLOSS_H__
@@ -34,11 +35,19 @@ namespace nts{
 class XLossGrad
 {
 public:
-    /* compute dE/dx for a given function y = f(x) */
-    void Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                 XTensor * dedy, XTensor * dedx, XTensor * padding,
-                 int funcID, void * params,
-                 LOSS_FUNCTION_NAME lossName);
+    /* compute dE/dx of a node */
+    static
+    void MakeGrad(XTensor * node, bool isEfficient);
+
+    /* indicates whether the node is for a Loss computation */
+    static
+    bool IsLossOP(XTensor * node);
+
+    ///* compute dE/dx for a given function y = f(x) */
+    //void Compute(XTensor * gold, XTensor * y, XTensor * x, 
+    //             XTensor * dedy, XTensor * dedx, XTensor * padding,
+    //             int funcID, void * params,
+    //             LOSS_FUNCTION_NAME lossName);

    /* compute dE/dy for variable y and error(loss) function E */
    void Compute(XTensor * gold, XTensor * y, 

--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -109,6 +109,11 @@ private:
    static
    void GradMultiplyDim(XTensor * node, bool isEfficient);

+    /* gradient for multiply by broadcasting: c = a * b
+       where some dimensions of b are of size 1 */
+    static
+    void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
+
    /* gradient for negate */
    static
    void GradNegate(XTensor * node, bool isEfficient);
@@ -125,6 +130,18 @@ private:
    static
    void GradScaleAndShift(XTensor * node, bool isEfficient);

+    /* gradient for Scale */
+    static
+    void GradScale(XTensor * node, bool isEfficient);
+
+    /* gradient for Shift */
+    static
+    void GradShift(XTensor * node, bool isEfficient);
+
+    /* gradient for Descale */
+    static
+    void GradDescale(XTensor * node, bool isEfficient);
+
    /* gradient for Minus */
    static
    void GradSub(XTensor * node, bool isEfficient);
@@ -143,6 +160,11 @@ private:
    static
    void GradSumDim(XTensor * node, bool isEfficient);

+    /* gradient for sum by broadcasting: c = a + b * \beta
+       where some dimensions of b are of size 1 */
+    static
+    void GradSumBroadcast(XTensor * node, bool isEfficient);
+
    /* gradient for reduceMean */
    static
    void GradReduceMean(XTensor * node, bool isEfficient);
@@ -158,6 +180,10 @@ private:
    /* gradient for reduceVariance */
    static
    void GradReduceVariance(XTensor * node, bool isEfficient);
+
+    /* gradient for MulAndShift */
+    static
+    void GradMulAndShift(XTensor * node, bool isEfficient);
 };

 }

--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
        GradCopyIndexed(node, isEfficent);
    else if(operID == MOVEMENT_GATHER)
        GradGather(node, isEfficent);
+    else if (operID == MOVEMENT_DROPOUTWITHINDEX)
+        GradDropoutWithIndex(node, isEfficent);
    else if(operID == SHAPE_MERGE)
        GradMerge(node, isEfficent);
    else if(operID == SHAPE_MERGE_LIST)
@@ -62,7 +64,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
    }
 }

-/* indicates whether the node is for a shape operation */
+/* indicates whether the node is for a shape operation */
 bool XShapeGrad::IsShapeOP(XTensor * node)
 {
    XLink &income = node->income;
@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
 void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
 {
    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");

    XTensor * input = income.tails[0];
    XTensor * index = income.tails[1];
@@ -126,6 +128,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
    node->visitMark = NODE_FINISHED;
 }

+/*
+gradient computation for DropoutWithIndex function
+>> node - the node generated by the DropoutWithIndex operation
+>> isEfficent - not used in this function
+*/
+void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
+{
+    XLink &income = node->income;
+
+    /* both the input (tails[0]) and the index (tails[1]) are read below */
+    CheckNTErrors(income.tailNum > 1, "Wrong input tensor number for DropoutWithIndex!");
+
+    XTensor * input = income.tails[0];
+    XTensor * index = income.tails[1];
+    DTYPE scale = income.GetParam(0);
+    XNoder::MakeGrad(input);
+
+    //_Identity(node->grad, input->grad);
+    _CopyValues(node->grad, input->grad);
+
+    /* remember the shape of node->grad; input->grad is reshaped back to it
+       at the end */
+    int order = node->grad->order;
+    int * dimSize = new int[order];
+
+    for (int i = 0; i < order; i++) {
+        dimSize[i] = node->grad->dimSize[i];
+    }
+
+    /* view the gradient of the input as a vector of unitNum items */
+    int order1 = 1;
+    int * dimSize1 = new int[order1];
+    dimSize1[0] = input->grad->unitNum;
+    
+    input->grad->Reshape(order1, dimSize1);
+
+    _DropoutWithIndex(node->grad, index, input->grad);
+    _ScaleAndShiftMe(input->grad, scale);
+
+    input->grad->Reshape(order, dimSize);
+
+    /* release the temporary shape arrays
+       NOTE(review): assumes Reshape copies the sizes and does not keep the
+       pointers - confirm against XTensor::Reshape */
+    delete[] dimSize;
+    delete[] dimSize1;
+
+    node->visitMark = NODE_FINISHED;
+}
+
 /* 
 gradient for merge
 for 
@@ -232,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");

    XTensor * last = NULL;
-    XList smalls(income.tailNum);
-    XList smallsGrad(income.tailNum);
+    TensorList smalls(income.tailNum);
+    TensorList smallsGrad(income.tailNum);
    bool mergeOnly = true;
    for(int i = 0; i < income.tailNum; i++){
        XTensor * tail = income.tails[i];
@@ -401,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
    /* we compute the gradient for current node, rather than for
       child node, i.e., we use the outgoing edge here */
    XLink &outgo = node->outgo;
-    XList splits(outgo.tailNum);
+    TensorList splits(outgo.tailNum);
    int whereToSplit = -1;
    int splitNum = 0;


--- a/source/network/XBackwardShape.h
+++ b/source/network/XBackwardShape.h
@@ -54,6 +54,10 @@ private:
    static
    void GradGather(XTensor * node, bool isEfficent);

+    /* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
+    static
+    void GradDropoutWithIndex(XTensor * node, bool isEfficent);
+
    /* gradient computation for merge: c = merge(a, b, ...) */
    static
    void GradMerge(XTensor * node, bool isEfficent);

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -79,13 +79,13 @@ backward propagation to obtain gradient
 */
 void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
 {
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

-    XList golds(1);
+    TensorList golds(1);
    golds.Add(NULL);

-    XList paddings(1);
+    TensorList paddings(1);
    paddings.Add(NULL);

    Backward(roots, golds, paddings, loss);
@@ -99,13 +99,13 @@ backward propagation to obtain gradient wrt. the loss/error function
 */
 void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
 {
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

-    XList golds(1);
+    TensorList golds(1);
    golds.Add(&gold);

-    XList paddings(1);
+    TensorList paddings(1);
    paddings.Add(NULL);

    Backward(roots, golds, paddings, loss);
@@ -120,13 +120,13 @@ backward propagation to obtain gradient wrt. the loss/error function
 */
 void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
 {
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

-    XList golds(1);
+    TensorList golds(1);
    golds.Add(&gold);

-    XList paddings(1);
+    TensorList paddings(1);
    paddings.Add(&padding);

    Backward(roots, golds, paddings, loss);
@@ -138,10 +138,10 @@ with a number of root nodes
 >> roots - a list of root nodes (output) of the network
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
 {
-    XList golds(roots.count);
-    XList paddings(roots.count);
+    TensorList golds(roots.count);
+    TensorList paddings(roots.count);
    for (int i = 0; i < roots.count; i++) {
        golds.Add(NULL);
        paddings.Add(NULL);
@@ -157,9 +157,9 @@ with a number of root nodes
 >> golds - a list of gold standard for the output
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
 {
-    XList paddings(roots.count);
+    TensorList paddings(roots.count);
    for (int i = 0; i < roots.count; i++)
        paddings.Add(NULL);

@@ -174,7 +174,7 @@ with a number of root nodes
 >> paddings - specify a target value that is ignored
 >> loss - name of loss function
 */
-void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
 {
    Traverse(roots);

@@ -187,21 +187,21 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
        node->visitMark = NODE_UNFINISHED;
    }

-    XLossGrad lossGrad;
+    //XLossGrad lossGrad;

    /* we start with the gradient with respect to the loss for output layers */
-    for(int i = 0; i < roots.count; i++){
+    /*for(int i = 0; i < roots.count; i++){
        XTensor * root = (XTensor*)roots.Get(i);
        XTensor * gold = (XTensor*)golds.Get(i);
        XTensor * padding = (XTensor*)paddings.Get(i);
        XLink &income = root->income;
        int funcID = income.typeID;
-        void * params = income.params;
+        void * params = income.params;*/

        /* we compute dE/dx if the output is generated by an activation function y = f(x).
           Note that we do not need to obtain dE/dy here because it is no use in the 
           folloing process of back-propagation */
-        if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
+        /*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
                XTensor * x = income.tails[0];
                XNoder::MakeGrad(x);
@@ -212,13 +212,13 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
                XNoder::MakeGrad(root);
                lossGrad.Compute(gold, root, root->grad, padding, loss);
            }
-        }
+        }*/
        /* we compuate dE/dy (y is the output) if no predefined activation function is used */
-        else{
+        /*else{
            XNoder::MakeGrad(root);
            lossGrad.Compute(gold, root, root->grad, NULL, loss);
        }
-    }
+    }*/
    
    /* back-propagation from output to input */
    for(int i = nodes.count - 1; i >= 0; i--){
@@ -266,6 +266,8 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
            XFuncGrad::MakeGrad(node, isEfficent);
        else if(XShapeGrad::IsShapeOP(node))
            XShapeGrad::MakeGrad(node, isEfficent);
+        else if(XLossGrad::IsLossOP(node))
+			XLossGrad::MakeGrad(node, isEfficent);
        else{
            ShowNTErrors("Wrong node type!");
        }
@@ -300,7 +302,7 @@ depth-first search (Tarjan's algorithm)
 */
 void XNet::Traverse(XTensor &root)
 {
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

    Traverse(roots);
@@ -311,7 +313,7 @@ traverse the net and find the topological order by
 depth-first search (Tarjan's algorithm) 
 >> roots - a list of roots (or output nodes)
 */
-void XNet::Traverse(XList &roots)
+void XNet::Traverse(TensorList &roots)
 {
    id = MakeNetID();
    nodes.Clear();
@@ -336,7 +338,7 @@ depth-first search given a node (Tarjan's algorithm for topological ordering)
 >> orders - topological order of the nodes
 >> code - code of the network
 */
-void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
+void XNet::TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code)
 {
    if(node == NULL)
        return;
@@ -444,7 +446,7 @@ show network topology
 */
 void XNet::ShowNetwork(FILE * file, XTensor * node)
 {
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(node);

    Traverse(roots);
@@ -458,4 +460,15 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
    }
 }

-}
\ No newline at end of file
+
+/*
+search for a node in a top-down manner by its name
+>> top - the top most node
+<< return - the node we found
+*/
+//XTensor * XNet::SearchNode(XTensor * top, const char * name)
+//{
+	//return XLink::SearchNode(top, name);
+//}
+
+}
--- a/source/network/XNet.h
+++ b/source/network/XNet.h
@@ -23,6 +23,7 @@

 #include "../tensor/XTensor.h"
 #include "../tensor/function/FHeader.h"
+#include "../tensor/loss/LHeader.h"

 #ifndef __XNET_H__
 #define __XNET_H__
@@ -36,16 +37,16 @@ struct XNet
    unsigned int id;

    /* tensor nodes of the network (in order) */
-    XList nodes;
+    TensorList nodes;

    /* tensor nodes to keep gradient for output (e.g., SGD)*/
-    XList gradNodes;
+    TensorList gradNodes;

    /* output nodes of the network */
-    XList outputs;
+    TensorList outputs;

    /* input nodes of the network */
-    XList inputs;
+    TensorList inputs;

    /* indicates whether the network just keeps the gradient for parameter tensors */
    bool isGradEfficient;
@@ -70,15 +71,15 @@ struct XNet

    /* backward propagation to obtain gradient
       with a number of root nodes */
-    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);
@@ -92,10 +93,10 @@ struct XNet

    /* traverse the net and find the topological order by 
       depth-first search (Tarjan's algorithm) */
-    void Traverse(XList &roots);
+    void Traverse(TensorList &roots);

    /* depth-first search given a node (Tarjan's algorithm for topological ordering) */
-    void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
+    void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);

    /* dump network information */
    void Dump(FILE * file);
@@ -111,6 +112,10 @@ struct XNet

    /* show network topology */
    void ShowNetwork(FILE * file, XTensor * node);
+
+    /* search a node in a top-down manner by its name */
+    //static
+    //XTensor * SearchNode(XTensor * top, const char * name);
 };

 /* we make a unique id for every tensor */

--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -20,7 +20,7 @@
 * This is a simple impelementation of the feed-forward network-baesd language
 * model (FNNLM). See more details about FNNLM in
 * "A Neural Probabilistic Language Model" by Bengio et al.
- * Journal of Machine Learning Research 3 (2003) 1137C1155
+ * Journal of Machine Learning Research 3 (2003) 1137–1155
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
 */
@@ -231,8 +231,8 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
    }

    for(int i = 0; i < argc; i++){
-        if (!strcmp(argv[i], "-mempool"))
-            model.mem = new XMem(model.devID);
+        if (!strcmp(argv[i], "-mem"))
+            model.mem = new XMem(model.devID, FREE_ON_THE_FLY, 256 * MILLION, 512, 256 * MILLION);
    }
 }

@@ -247,13 +247,13 @@ void Check(FNNModel &model)
 /* make a hard copy of the fnn model */
 void Copy(FNNModel &tgt, FNNModel &src)
 {
-    InitTensor(&tgt.embeddingW, &src.embeddingW);
+    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
    for(int i = 0; i < MAX_HIDDEN_NUM; i++){
-        InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
-        InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
+        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
+        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
    }
-    InitTensor(&tgt.outputW, &src.outputW);
-    InitTensor(&tgt.outputB, &src.outputB);
+    InitTensorV2(&tgt.outputW, &src.outputW);
+    InitTensorV2(&tgt.outputB, &src.outputB);

    tgt.n = src.n;
    tgt.eSize = src.eSize;
@@ -310,7 +310,7 @@ initialize a 1d tensor using the fnn model setting
 */
 void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
 {
-    InitTensor1D(&tensor, num, X_FLOAT, model.devID, model.mem);
+    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
 }

 /* 
@@ -322,7 +322,7 @@ initialize a 2d tensor using the fnn model setting
 */
 void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
 {
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID, model.mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
 }


@@ -449,6 +449,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
            /* the gold standard */
            XTensor gold;

+            /* the loss tensor */
+            XTensor lossTensor;
+
            /* make the input tensor for position i */
            for(int i = 0; i < model.n - 1; i++)
                MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
@@ -471,30 +474,30 @@ void Train(const char * train, bool isShuffled, FNNModel &model)

                /* update model parameters */
                Update(model, grad, learningRate, false);
+
+                /* get probabilities */
+                float prob = GetProb(output, gold);
+                loss -= prob;
            }
            else{
                /* gradient = 0 */
                Clear(model, true);

                /* forward + backward process */
-				
-				/* this is implemented by gather function */
                ForwardAutoDiff(ngrams, ngramNum, output, model);
-				
-				/* this is implemented by multiply function */
-				//ForwardAutoDiff(inputs, output, model);
+                lossTensor = CrossEntropy(output, gold);

                /* automatic differentiation */
-                autoDiffer.Backward(output, gold, CROSSENTROPY);
+                autoDiffer.Backward(lossTensor);

                /* update model parameters */
                Update(model, grad, learningRate, true);
+
+                /* get probabilities */
+                float prob = ReduceSumAll(lossTensor);
+                loss += prob;
            }
-                
-            /* get probabilities */
-            float prob = GetProb(output, gold);
-                
-            loss += -prob;
+
            wordCount += ngramNum;
            wordCountTotal += ngramNum;
            
@@ -537,8 +540,8 @@ update the model parameters using the delta rule
 */
 void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
 {
-    XList paraList(10);
-    XList gradList(10);
+    TensorList paraList(10);
+    TensorList gradList(10);

    paraList.Add(&model.outputW);
    paraList.Add(&model.outputB);
@@ -595,14 +598,14 @@ get prediction probabilites of the gold words
 float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
 {
    XTensor probs;
-    InitTensor(&probs, &output);
+    InitTensorV2(&probs, &output);
    
    /* probs[i,j] = output[i,j] * gold[i,j] */
    _Multiply(&output, &gold, &probs);

    /* probability of each word */
    XTensor wprobs;
-    InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID, output.mem);
+    InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
    _ReduceSum(&probs, &wprobs, 1);
    if(wordProbs != NULL)
        _CopyValues(&wprobs, wordProbs);
@@ -616,7 +619,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
 
    /* probability for the batch */
    XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output.devID, output.mem);
+    InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
    _ReduceSum(&probs, &result, 1);
    
    return result.Get1D(0);
@@ -718,7 +721,7 @@ The indexed cell is set to 1, and 0 otherwise.
 void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, 
                         int itemNum, int devID, XMem * mem)
 {
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);

    tensor.SetZeroAll();

@@ -765,7 +768,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
    int batchSize = -1;
    int n = model.n;
    int depth = model.hDepth;
-    XList eList(n - 1);
+    TensorList eList(n - 1);

    /* previoius n - 1 words */
    for(int i = 0; i < n - 1; i++){
@@ -811,7 +814,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)

        /* make a 2d tensor for the bias term */
        XTensor b2D;
-        InitTensor(&b2D, &s);
+        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        /* introduce bias term:
@@ -843,7 +846,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
        _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor b2D;
-        InitTensor(&b2D, &s);
+        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        _Sum(&s, &b2D, &s);
@@ -908,8 +911,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
    XTensor dedsHidden;
    XTensor dedxBottom;
    if (depth > 0)
-        InitTensor(&dedsHidden, &dedx);
-    InitTensor(&dedxBottom, &net.embeddingCat);
+        InitTensorV2(&dedsHidden, &dedx);
+    InitTensorV2(&dedxBottom, &net.embeddingCat);

    /* back-propagation from top to bottom in the stack of hidden layers
       for each layer, h = f(s)
@@ -927,7 +930,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
        
        /* backpropagation through the activation fucntion: 
           dE/ds = dE/dh * dh/ds */
-        _HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
+        _HardTanHBackward(&h, &s, &dedh, &deds);

        /* gradient of the weight: dE/dw = x^T * dE/ds   */
        _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
@@ -943,11 +946,11 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
            _CopyValues(&dedx, &gradPassed);
    }

-    XList eList(n - 1);
+    TensorList eList(n - 1);

    /* back-propagation for the embedding layer */
    for (int i = 0; i < n - 1; i++) {
-        XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID, model.mem);
+        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
        eList.Add(dedy);
    }

@@ -999,7 +1002,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
        }
    }

-    InitTensor1D(&words, size, X_INT, model.devID, model.mem);
+    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);

    embeddingBig = Gather(model.embeddingW, words);
@@ -1017,7 +1020,8 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
        hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);

    /* output layer */
-    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
 }

 /*
@@ -1036,7 +1040,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
    XTensor hidden;
    XTensor b;

-    XList inputList(n - 1);
+    TensorList inputList(n - 1);
    for(int i = 0; i < n - 1; i++)
        inputList.Add(inputs + i);

@@ -1170,6 +1174,7 @@ void Test(const char * test, const char * result, FNNModel &model)
        else {			
 			/* this is implemented by gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);
+            output = Log(output);
 				
 			/* this is implemented by multiply function */
 			//ForwardAutoDiff(inputs, output, model);
@@ -1177,7 +1182,7 @@ void Test(const char * test, const char * result, FNNModel &model)

        /* prediction probabilities */
        XTensor probs;
-        InitTensor1D(&probs, ngramNum);
+        InitTensor1DV2(&probs, ngramNum);

        /* get probabilities */
        float prob = GetProb(output, gold, &probs);
@@ -1200,6 +1205,7 @@ void Test(const char * test, const char * result, FNNModel &model)
    }

    fclose(file);
+    fclose(ofile);

    double elapsed = GetClockSec() - startT;


--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
-    
+    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
+
    float scale = 1.0F;
    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
    float finfouta = (float)sqrt(6.0F * scale / (d + d));
+    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));

    wk.SetDataRand(-finfoutk, finfoutk);
    wq.SetDataRand(-finfoutk, finfoutk);
    wv.SetDataRand(-finfoutv, finfoutv);
    wa.SetDataRand(-finfouta, finfouta);
+    wbig.SetDataRand(-finfoutbig, finfoutbig);
 }

 /* 
@@ -103,40 +106,88 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
    XTensor k2;
    XTensor q2;
    XTensor v2;
-
-    /* linear transofmration before self-attention */
+    
+    /* linear transformation before self-attention */
    k2 = MMul(k, wk);
    q2 = MMul(q, wq);
    v2 = MMul(v, wv);
-
+    
+    return MakeAttention(k2, q2, v2, mask, isTraining);
+}
+    
+/*
+make the network given one big tensor: it is multiplied by wbig once and the product is cut into 3 pieces
+>> kqv - the big tensor (the product kqv * wbig is read as an order-3 tensor)
+>> mask - as it is (handed to MakeAttention unchanged)
+>> isTraining - indicates whether the model is used for training
+*/
+XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
+{
+    XTensor k2;
+    XTensor q2;
+    XTensor v2;
+    XTensor kqv2;
+    TensorList split;
+    /* a single matrix product replaces the three separate products with wk, wq and wv used in Make() */
+    kqv2 = MMul(kqv, wbig);
+    /* each piece keeps dimensions 0 and 1 and takes one third of dimension 2 */
+    int d1 = kqv2.GetDim(0);
+    int d2 = kqv2.GetDim(1);
+    int d3 = kqv2.GetDim(2) / 3;
+    /* allocate the three pieces on the same device/memory pool as the model */
+    InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
+    InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
+    InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
+    /* NOTE(review): pieces are listed as q2, k2, v2 - presumably filled in this order; confirm against Split */
+    split.Add(&q2);
+    split.Add(&k2);
+    split.Add(&v2);
+    /* presumably: split kqv2 along dimension 2 into 3 pieces - confirm the argument order of Split */
+    Split(kqv2, split, 2, 3);
+    
+    return MakeAttention(k2, q2, v2, mask, isTraining);
+}
+    
+/*
+make the attention network given keys, queries and values (after linear transformation)
+>> k - keys. It might be of size B * L * H
+       where B = batch size, L = sequence length,
+       and H = vector size of each position
+>> q - queries
+>> v - values
+>> mask - as it is
+>> isTraining - indicates whether the model is used for training
+*/
+XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
+{
    XTensor kheads;
    XTensor qheads;
    XTensor vheads;
-
+    
    /* multi head */
-    kheads = Split(k2, k2.order - 1, nhead);
-    qheads = Split(q2, q2.order - 1, nhead);
-    vheads = Split(v2, v2.order - 1, nhead);
-
+    kheads = Split(k, k.order - 1, nhead);
+    qheads = Split(q, q.order - 1, nhead);
+    vheads = Split(v, v.order - 1, nhead);
+    
    XTensor att;
    XTensor dot;
    XTensor scalar;
-
+    
    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
-
+    
    if(isMasked)
        dot = dot + mask;
-
+    
    dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
-
-    scalar = Softmax(dot, -1);
    
+    scalar = Softmax(dot, -1);
+
    if(isTraining && dropoutP > 0)
        scalar = Dropout(scalar, dropoutP);
-
+    
    att = BMMul(scalar, vheads);
-
+    
    /* concatenate the heads */
    return MMul(Merge(att, att.order - 1), wa);
 }

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -59,7 +59,9 @@ public:

    /* transformation after dot-product attention */
    XTensor wa;
-
+    
+    XTensor wbig;
+	
    /* size of transformed Q and K */
    int dk;

@@ -96,6 +98,12 @@ public:

    /* make the network */
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
+    
+    /* make the network given a big tensor that keeps keys, queries and values */
+    XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
+    
+    /* make the attention network given keys, queries and values (after linear transformation) */
+    XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
 };

 }

--- a/source/sample/transformer/T2TBatchLoader.cpp
+++ b/source/sample/transformer/T2TBatchLoader.cpp
--- a/source/sample/transformer/T2TBatchLoader.h
+++ b/source/sample/transformer/T2TBatchLoader.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
+ * it is cold today but i'll move to a warm place tomorrow :)
+ */
+
+#ifndef __T2TBATCHLOADER_H__
+#define __T2TBATCHLOADER_H__
+
+#include "../../network/XNet.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* maximum sequence length (4096). The replacement list is parenthesized so that
+   the macro expands as one value inside larger expressions, e.g.
+   x / MAX_SEQUENCE_LENGTH or x % MAX_SEQUENCE_LENGTH. */
+#define MAX_SEQUENCE_LENGTH (1024 * 4)
+
+/* node to keep batch information: a [beg, end] range plus the length
+   statistics and the sorting key of one batch.
+   NOTE(review): beg/end are presumably positions in the loader's sequence
+   buffer, and whether end is inclusive is not visible here - confirm in
+   T2TBatchLoader.cpp */
+struct BatchNode
+{
+    /* beginning position */
+    int beg;
+
+    /* end position */
+    int end;
+
+    /* maximum word number on the encoder side */
+    int maxEnc;
+
+    /* maximum word number on the decoder side */
+    int maxDec;
+
+    /* a key for sorting */
+    int key;
+};
+
+class T2TBatchLoader
+{
+public:
+    /* buffer for loading words */
+    int * buf;
+
+    /* another buffer for words
+       NOTE(review): its exact role is not visible in this header - see T2TBatchLoader.cpp */
+    int * buf2;
+
+    /* buffer of batch nodes (see BatchNode) */
+    BatchNode * bufBatch;
+
+    /* buffer size */
+    int bufSize;
+
+    /* size of batch buffer */
+    int bufBatchSize;
+
+    /* length of each sequence */
+    int * seqLen;
+
+    /* another array of sequence lengths
+       NOTE(review): presumably paired with buf2 - confirm in T2TBatchLoader.cpp */
+    int * seqLen2;
+
+    /* offset of the first word for each sequence */
+    int * seqOffset;
+
+    /* number of sequences in the buffer */
+    int nseqBuf;
+
+    /* offset for next sequence in the buffer */
+    int nextSeq;
+
+    /* offset for next batch */
+    int nextBatch;
+
+    /* indicates whether we double the </s> symbol for the output of lms */
+    bool isDoubledEnd;
+    
+    /* indicates whether we use batchsize = max * sc
+       rather than batchsize = word-number, where max is the maximum
+       length and sc is the sentence number */
+    bool isSmallBatch;
+
+    /* counterpart of "isSmallBatch" */
+    bool isBigBatch;
+
+    /* randomize batches */
+    bool isRandomBatch;
+
+    /* bucket size */
+    int bucketSize;
+
+public:
+    /* constructor */
+    T2TBatchLoader();
+
+    /* destructor */
+    ~T2TBatchLoader();
+
+    /* initialization from command-line style arguments (argc/argv) */
+    void Init(int argc, char ** argv);
+
+    /* load data to buffer */
+    int LoadBuf(FILE * file, bool isSorted, int step);
+
+    /* clear data buffer */
+    void ClearBuf();
+
+    /* set the random batch flag */
+    void SetRandomBatch(bool flag = true);
+
+    /* load a batch of sequences
+       NOTE(review): isLM presumably selects between LoadBatchLM and
+       LoadBatchMT below - confirm in T2TBatchLoader.cpp */
+    int LoadBatch(FILE * file, bool isLM,
+                  XTensor * batchEnc, XTensor * paddingEnc, 
+                  XTensor * batchDec, XTensor * paddingDec,
+                  XTensor * gold, XTensor * label,
+                  int * seqs,
+                  int vsEnc, int vsDec, int sBatch, int wBatch, 
+                  bool isSorted, int &ws, int &wCount,
+                  int devID, XMem * mem, 
+				  bool isTraining);
+
+    /* load a batch of sequences (for language modeling);
+       takes a single vocabulary size (vs) instead of vsEnc/vsDec */
+    int LoadBatchLM(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc,
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold, XTensor * label,
+                    int * seqs, int vs, int sBatch, int wBatch, 
+                    bool isSorted, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);
+
+    /* load a batch of sequences (for machine translation);
+       takes separate vocabulary sizes for the encoder (vsEnc) and the decoder (vsDec) */
+    int LoadBatchMT(FILE * file, 
+                    XTensor * batchEnc, XTensor * paddingEnc, 
+                    XTensor * batchDec, XTensor * paddingDec,
+                    XTensor * gold, XTensor * label,
+                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
+                    bool isSorted, int &ws, int &wCount,
+                    int devID, XMem * mem, 
+					bool isTraining);
+
+    /* shuffle the data file (reads srcFile; output presumably goes to tgtFile - confirm) */
+    void Shuffle(const char * srcFile, const char * tgtFile);
+};
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -21,6 +21,8 @@

 #include <math.h>
 #include "T2TDecoder.h"
+#include "T2TUtility.h"
+#include "T2TLayerNormal.h"
 #include "../../tensor/core/CHeader.h"

 namespace transformer
@@ -29,6 +31,10 @@ namespace transformer
 /* constructor */
 AttDecoder::AttDecoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
    attentionsEnde = NULL;
    attEndeLayerNorms = NULL;
 }
@@ -36,6 +42,10 @@ AttDecoder::AttDecoder()
 /* de-constructor */
 AttDecoder::~AttDecoder()
 {
+    delete[] attentions;
+    delete[] fnns;
+    delete[] attLayerNorms;
+    delete[] fnnLayerNorms;
    delete[] attentionsEnde;
    delete[] attEndeLayerNorms;
 }
@@ -53,14 +63,38 @@ void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored, 
                           int myDevID, XMem * myMem)
 {
-    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);

+    devID = myDevID;
+    mem = myMem;
+    ignored = myIgnored;
+
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
+    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+
+    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
+
+    /* embedding model */
+    embedder.InitModel(argc, argv, devID, mem, false);
+
+    attentions = new T2TAttention[nlayer];
+    fnns = new T2TFNN[nlayer];
+    attLayerNorms = new T2TLN[nlayer];
+    fnnLayerNorms = new T2TLN[nlayer];
    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
-    for(int i = 0; i < nlayer; i++){
-        attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    for (int i = 0; i < nlayer; i++) {
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
 }
@@ -93,7 +127,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X

        /******************/
        /* self attention */
-        att = attentions[i].Make(x, x, x, mask, isTraining);
+        att = attentions[i].MakeBig(x, mask, isTraining);

        /* dropout */
        if(isTraining && dropoutP > 0)
@@ -133,6 +167,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /* layer normalization */
        x = fnnLayerNorms[i].Make(res);
    }
+    
+    x.SetName(DECODING_NAME);

    return x;
 }

--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
@@ -26,10 +26,60 @@

 namespace transformer
 {
+    
+#define DECODING_NAME "decoding"
+#define DECODING_INPUT_NAME "decoding_input"

-class AttDecoder : public AttEncoder
+class AttDecoder
 {
 public:
+
+    /* device id */
+    int devID;
+
+    /* memory pool */
+    XMem * mem;
+
+    /* layer number */
+    int nlayer;
+
+    /* hidden layer size of the FNN layer */
+    int hSize;
+
+    /* embedding size */
+    int eSize;
+
+    /* vocabulary size */
+    int vSize;
+
+    /* dropout probability */
+    DTYPE dropoutP;
+
+    /* some positions can be ignored in attention. this is useful in lm where the first position needs
+       special design for the attention model. */
+    int ignored;
+
+    /* embedding of word at each position */
+    T2TEmbedder embedder;
+
+    /* FNN model of each layer */
+    T2TFNN * fnns;
+
+    /* attention model of each layer */
+    T2TAttention * attentions;
+
+    /* layer normalization for fnn */
+    T2TLN * fnnLayerNorms;
+
+    /* layer normalization for attention */
+    T2TLN * attLayerNorms;
+
+    /* input tensor of the decoder */
+    XTensor * input;
+
+    /* output tensor of the decoder */
+    XTensor * output;
+
    /* encoder-decoder attention model of each layer */
    T2TAttention * attentionsEnde;


--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -48,12 +48,18 @@ initialize the model
 >> myDevID - device id
 >> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
 {
    devID = myDevID;
    mem = myMem;
    
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    if(isEnc){
+        LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    }
+    else{
+        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    }
+    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);

--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
@@ -71,7 +71,7 @@ public:
    ~T2TEmbedder();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int eSize, int d, int length);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -103,8 +103,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo

    x = embedder.Make(input);

-    //x.Dump(tmpFILE, "embedding: ");
-
    /* dropout */
    if(isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);
@@ -116,8 +114,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        XTensor res;

        /* self attention */
-        att = attentions[i].Make(x, x, x, mask, isTraining);
-
+        att = attentions[i].MakeBig(x, mask, isTraining);
+        
        /* dropout */
        if(isTraining && dropoutP > 0)
            att = Dropout(att, dropoutP);
@@ -141,6 +139,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        /* layer normalization */
        x = fnnLayerNorms[i].Make(res);
    }
+    
+    x.SetName(ENCODING_NAME);
+    input.SetName(ENCODING_INPUT_NAME);

    return x;
 }

--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
@@ -32,6 +32,9 @@ using namespace nts;

 namespace transformer
 {
+    
+#define ENCODING_NAME "encoding"
+#define ENCODING_INPUT_NAME "encoding_input"

 /* 
 base class of the encoder 

--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -89,13 +89,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
    XTensor t1;

    /* t1 = max(0, x * w1 + b1) */
-    t1 = Rectify(MMul(input, w1) + b1);
+    //t1 = Rectify(MMul(input, w1) + b1);
+    t1 = Rectify(MulAndShift(input, w1, b1));
    
    if(isTraining && dropoutP > 0)
        t1 = Dropout(t1, dropoutP);

    /* result = t1 * w2 + b2 */
-    return MMul(t1, w2) + b2;
+    //return MMul(t1, w2) + b2;
+    return MulAndShift(t1, w2, b2);
 }



--- a/source/sample/transformer/T2TLengthPenalty.cpp
+++ b/source/sample/transformer/T2TLengthPenalty.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../tensor/core/CHeader.h"
+#include "T2TLengthPenalty.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* 
+GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha 
+where n = length of the sequence 
+>> length - length of the sequence (for each entry)
+>> alpha - the parameter that controls the length preference
+<< return - length penalty of the sequence (for each entry)
+*/
+XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
+{
+    XTensor base;
+    XTensor lp;
+
+    //earlier formulation, kept for reference: base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
+    base = (length + 5)/(1 + 5);    /* (n + 5) / 6; assumes XTensor overloads + and / with scalars - TODO confirm */
+
+    lp = Power(base, alpha);        /* base ^ alpha */
+    
+    return lp;
+}
+
+}
--- a/source/sample/transformer/T2TLengthPenalty.h
+++ b/source/sample/transformer/T2TLengthPenalty.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
+ * Start of a new week - I just finished several documents.
+ * Writing document is harder than writing code :)
+ */
+
+#ifndef __T2TLENGTHPENALTY_H__
+#define __T2TLENGTHPENALTY_H__
+
+#include "../../tensor/XTensor.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* We intend to penalize short sequences because they have higher scores
+   in a product of a sequence of probability-like terms and thus have more
+   chances to beat others in search. */
+class T2TLengthPenalizer
+{
+public:
+    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha 
+       where n = length of the sequence
+       >> length - length of the sequence (for each entry)
+       >> alpha - the parameter that controls the length preference
+       << return - length penalty of the sequence (for each entry) */
+    static
+    XTensor GNMT(const XTensor & length, float alpha);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
@@ -31,6 +31,9 @@
 namespace transformer
 {

+/* a transformer model that keeps parameters of the encoder,
+   the decoder and the output layer (softmax). Also, it creates
+   the network used in transformer. */
 class T2TModel
 {
 public:
@@ -78,10 +81,24 @@ public:
    void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);

    /* make the network for machine translation (with the output softmax layer) */
-    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
+    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, 
+                XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
+
+    /* make the mask for training MT models */
+    void MakeMTMask(XTensor &inputEnc, XTensor &inputDec, 
+                    XTensor &paddingEnc, XTensor &paddingDec, 
+                    XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
+    
+    /* make the mask of the encoder */
+    void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
+    
+    /* make the mask of the decoder */
+    void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
+                       XTensor &paddingEnc, XTensor &paddingDec,
+                       XTensor &maskDec, XTensor &maskEncDec);

    /* get parameter matrics */
-    void GetParams(XList &list);
+    void GetParams(TensorList &list);

    /* dump the parameters */
    void Dump(const char * fn);

--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)

    float minmax = 0;

-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
@@ -93,8 +93,9 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
    XTensor &x = input;

-    output = LogSoftmax(MMul(x, w), -1);
-    //output = Softmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
+    output.SetName(OUTPUT_NAME);
 }

 }
--- a/source/sample/transformer/T2TOutput.h
+++ b/source/sample/transformer/T2TOutput.h
@@ -28,6 +28,8 @@ using namespace nts;

 namespace transformer
 {
+    
+#define OUTPUT_NAME "output"

 /* output layer */
 class T2TOutput

--- a/source/sample/transformer/T2TPredictor.cpp
+++ b/source/sample/transformer/T2TPredictor.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
+ */
+
+#include "T2TPredictor.h"
+#include "../../tensor/core/CHeader.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* constructor: start with an empty bundle (no state array allocated) */
+T2TStateBundle::T2TStateBundle()
+{
+    states = NULL;
+
+    /* keep the counter consistent with the NULL array. It used to be left
+       uninitialized until MakeStates() or T2TPredictor::Create() assigned it. */
+    stateNum = 0;
+
+    isStart = false;
+}
+
+/* de-constructor: release the state array owned by the bundle */
+T2TStateBundle::~T2TStateBundle()
+{
+    /* delete[] on a null pointer does nothing, so no explicit check is needed */
+    delete[] states;
+}
+
+/* 
+(re)allocate the array of states and reset every entry to its empty default.
+An array allocated by an earlier call is released first.
+>> num - number of states (must be positive)
+*/
+void T2TStateBundle::MakeStates(int num)
+{
+    CheckNTErrors(num > 0, "invalid number");
+
+    /* drop the old array (deleting a null pointer is harmless) */
+    delete[] states;
+    states = new T2TState[num];
+
+    /* walk the fresh array and put every state into its "empty" configuration */
+    T2TState * const end = states + num;
+    for(T2TState * st = states; st < end; ++st){
+        st->prediction = -1;
+        st->pid = T2T_PID_EMPTY;
+        st->isEnd = false;
+        st->isStart = false;
+        st->isCompleted = false;
+        st->prob = 0;
+        st->probPath = 0;
+        st->modelScore = 0;
+        st->nstep = 0;
+        st->last = NULL;
+    }
+
+    stateNum = num;
+}
+
+/* constructor: no model or state is attached until Read() is called */
+T2TPredictor::T2TPredictor()
+{
+    /* these pointers used to be left uninitialized; make the "nothing has
+       been read yet" condition well defined */
+    m = NULL;
+    s = NULL;
+
+    /* -1 means that no start symbol has been set (see SetStartSymbol) */
+    startSymbol = -1;
+}
+
+/* de-constructor: nothing is released here. The model and state pointers kept
+   by the predictor are the ones passed to Read() and are not freed by this class. */
+T2TPredictor::~T2TPredictor()
+{
+}
+
+/* 
+create an initial state 
+>> model - the t2t model (not referenced in the body of this function)
+>> top - the top-most layer of the network
+>> input - input of the network
+>> beamSize - beam size
+>> state - the state to be initialized
+*/
+void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
+{
+    state->layersEnc.Clear();
+    state->layersDec.Clear();
+
+    /* find the tensor registered under ENCODING_NAME, searching from "top" */
+    XTensor * encNode = XLink::SearchNode(top, ENCODING_NAME);
+    CheckNTErrors(encNode != NULL, "No encoding layers found!");
+
+    state->layersEnc.Add(encNode);
+
+    /* NULL placeholder: Predict() takes a NULL first item on the decoder side
+       as "there is no previous decoder input yet" */
+    state->layersDec.Add(NULL);
+
+    /* the bookkeeping tensors copy the shape of the input, except that the
+       last dimension is replaced by the beam size */
+    const int order = input->order;
+    int shape[MAX_TENSOR_DIM_NUM];
+    for (int d = 0; d < order - 1; d++)
+        shape[d] = input->GetDim(d);
+    shape[order - 1] = beamSize;
+
+    InitTensor(&state->probPath, order, shape, X_FLOAT, 1.0F, input->devID, input->mem);
+    InitTensor(&state->nstep, order, shape, X_FLOAT, 1.0F, input->devID, input->mem);
+    InitTensor(&state->endMark, order, shape, X_INT, 1.0F, input->devID, input->mem);
+
+    /* all of them start from zero */
+    state->probPath.SetZeroAll();
+    state->nstep.SetZeroAll();
+    state->endMark.SetZeroAll();
+
+    state->stateNum = 0;
+}
+
+/*
+remember the symbol that Predict() puts at the first position of the decoder input
+>> symbol - the symbol (in integer)
+*/
+void T2TPredictor::SetStartSymbol(int symbol)
+{
+    this->startSymbol = symbol;
+}
+
+/* 
+read a state: keep the pointers to the model and to the current state bundle
+so that a following call of Predict() can use them. Nothing is copied.
+>> model - the t2t model that keeps the network created so far
+>> state - a set of states. It keeps
+             1) hypotheses (states)
+             2) probabilities of hypotheses
+             3) parts of the network for expanding toward the next state
+*/
+void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
+{
+    this->m = model;
+    this->s = state;
+}
+
+/*
+predict the next state: build the decoder input (start symbol plus the paths
+generated so far), run the decoder on it, and compute the output distribution
+for the last position. The model and the current state must have been set by
+Read() beforehand because "m" and "s" are dereferenced without a check.
+>> next - next states (assuming that the current state has been read)
+>> encoding - encoder output
+>> inputEnc - input of the encoder
+>> paddingEnc - padding of the encoder
+*/
+void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
+                           XTensor * inputEnc, XTensor * paddingEnc)
+{
+    int dims[MAX_TENSOR_DIM_NUM];
+
+    next->layersEnc.Clear();
+    next->layersDec.Clear();
+    
+    AttDecoder &decoder = *m->decoder;
+    
+    /* word indices of previous positions. This is NULL for a state that
+       comes from Create(), which stores NULL as the first decoder item. */
+    XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
+
+    /* word indices of positions up to next state */
+    XTensor inputDec;
+
+    /* the first token */
+    XTensor first;
+    
+    /* "first" has the shape of the encoder input with the last dimension set to 1,
+       and every entry is the start symbol */
+    CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
+    for(int i = 0; i < inputEnc->order - 1; i++)
+        dims[i] = inputEnc->GetDim(i);
+    dims[inputEnc->order - 1] = 1;
+
+    InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
+    _SetDataFixedInt(&first, startSymbol);
+
+    /* add a new word into the input sequence of the decoder side */
+    if (inputLast == NULL) {
+        /* no history yet: the decoder input is just the start symbol */
+        inputDec = Identity(first);
+    }
+    else{
+        /* rebuild the word sequences from the back-pointers of the current states,
+           move them to the device of the encoder input, and put the start symbol
+           in front of them along the last dimension */
+        inputDec = GeneratePaths(s);
+        inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
+
+        inputDec = Concatenate(first, inputDec, inputDec.order - 1);
+    }
+
+    /* prediction probabilities */
+    XTensor &output = next->prob;
+    XTensor decoding;
+    XTensor decodingStep;
+    
+    /* the decoder padding has exactly the shape of the decoder input */
+    for(int i = 0; i < inputDec.order - 1; i++)
+        dims[i] = inputDec.GetDim(i);
+    dims[inputDec.order - 1] = inputDec.GetDim(-1);
+    
+    /* every entry of the padding is set to 1 */
+    XTensor paddingDec;
+    InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
+    SetDataFixedInt(paddingDec, 1);
+    
+    XTensor maskDec;
+    XTensor maskEncDec;
+    
+    /* decoder mask */
+    m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
+
+    /* make the decoding network */
+    decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
+
+    XTensor selectSrc;
+    XTensor selectTgt;
+
+    CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
+
+    /* size of the second-to-last dimension of the decoder output */
+    int stride = decoding.GetDim(decoding.order - 2);
+
+    /* one-element index tensors: copy source index (stride - 1) to target index 0 */
+    InitTensor1D(&selectSrc, 1, X_INT);
+    InitTensor1D(&selectTgt, 1, X_INT);
+
+    selectSrc.SetInt(stride - 1, 0);
+    selectTgt.SetInt(0, 0);
+
+    selectSrc.SetDevice(decoding.devID, decoding.mem);
+    selectTgt.SetDevice(decoding.devID, decoding.mem);
+    
+    /* the decoder output of the last position */
+    decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
+
+    /* generate the output probabilities */
+    m->outputLayer->Make(decodingStep, output);
+    
+    next->layersEnc.AddList(&s->layersEnc);
+    /* NOTE(review): inputDec is a local variable of this function, so the pointer
+       stored here dangles once Predict() returns. Within this file the first item
+       of layersDec is only compared against NULL (see inputLast above) - confirm
+       that no other code dereferences it, or keep the tensor inside the bundle. */
+    next->layersDec.Add(&inputDec);
+    /* "output" refers to next->prob, which lives as long as the bundle does */
+    next->layersDec.Add(&output);
+}
+
+/* 
+generate paths up to the states of the current step 
+>> state - state bundle of the current step
+<< return - a 2D integer tensor of size (number of states * length of the
+            longest path). Row i keeps the predictions found by following
+            the back-pointers of state i. Rows are right-aligned: the last
+            column is the state itself and unused leading cells are 0.
+*/
+XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
+{
+    CheckNTErrors(state->stateNum >= 0, "Illegal state!");
+
+    /* pass 1: length of the longest back-pointer chain
+       (it stays -1 if the bundle has no states) */
+    int distance = -1;
+
+    for(int i = 0; i < state->stateNum; i++){
+        int len = 0;
+
+        for(T2TState * node = state->states + i; node != NULL; node = node->last)
+            len++;
+
+        if(len > distance)
+            distance = len;
+    }
+
+    XTensor path;
+    InitTensor2D(&path, state->stateNum, distance, X_INT);
+    path.SetZeroAll();
+
+    /* pass 2: walk every chain again, from the newest state backwards,
+       and fill the row from the right-most column to the left */
+    for(int i = 0; i < state->stateNum; i++){
+        int col = distance;
+
+        for(T2TState * node = state->states + i; node != NULL; node = node->last)
+            path.Set2DInt(node->prediction, i, --col);
+    }
+
+    return path;
+}
+
+}
+
--- a/source/sample/transformer/T2TPredictor.h
+++ b/source/sample/transformer/T2TPredictor.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
+ * This is the first source file I create in 2019 - new start!
+ */
+
+#ifndef __T2TPREDICTOR_H__
+#define __T2TPREDICTOR_H__
+
+#include "T2TModel.h"
+#include "T2TLengthPenalty.h"
+
+namespace transformer
+{
+
+#define T2T_PID_EMPTY -1
+
+/* state for search. It keeps the path (back-pointer), the probabilities of the
+   prediction and of the path, and etc. It can be regarded as a hypothesis in
+   translation. T2TStateBundle::MakeStates resets all the fields to their
+   empty defaults. */
+class T2TState
+{
+public:
+    /* we assume that the prediction is an integer (-1 after MakeStates) */
+    int prediction;
+
+    /* id of the problem. One can regard it as the sentence id when we 
+       translate a number of sentences in the batched manner. The hypothesis 
+       is empty if id = -1 (T2T_PID_EMPTY) */
+    int pid;
+
+    /* indicates whether the state is an end */
+    bool isEnd;
+
+    /* indicates whether the state is the start */
+    bool isStart;
+
+    /* indicates whether the state is completed */
+    bool isCompleted;
+
+    /* probability of every prediction (last state of the path) */
+    float prob;
+
+    /* probability of every path */
+    float probPath;
+
+    /* model score of every path. A model score = path probability + some other stuff */
+    float modelScore;
+
+    /* number of steps we go over so far */
+    int nstep;
+
+    /* pointer to the previous state (NULL if there is no previous state) */
+    T2TState * last;
+};
+
+/* a bundle of states: the tensors shared by all hypotheses of one search step
+   plus the array of the hypotheses (states) themselves */
+class T2TStateBundle
+{
+public:
+    /* predictions */
+    XTensor prediction;
+    
+    /* id of the previous state that generates the current one  */
+    XTensor preID;
+
+    /* mark that indicates whether each hypothesis is completed */
+    XTensor endMark;
+
+    /* probability of every prediction (last state of the path) */
+    XTensor prob;
+
+    /* probability of every path */
+    XTensor probPath;
+
+    /* model score of every path */
+    XTensor modelScore;
+
+    /* step number of each hypothesis */
+    XTensor nstep;
+
+    /* layers on the encoder side. We actually use the encoder output instead
+       of all hidden layers. */
+    TensorList layersEnc;
+
+    /* layers on the decoder side. The list keeps pointers only;
+       T2TPredictor::Create puts NULL in the first slot. */
+    TensorList layersDec;
+
+    /* list of states (array allocated by MakeStates and released by the de-constructor) */
+    T2TState * states;
+
+    /* number of states */
+    int stateNum;
+
+    /* indicates whether it is the first state */
+    bool isStart;
+
+public:
+    /* constructor */
+    T2TStateBundle();
+
+    /* de-constructor */
+    ~T2TStateBundle();
+
+    /* create states: allocate "num" states and reset each of them */
+    void MakeStates(int num);
+};
+
+/* The predictor reads the current state and then predicts the next. 
+   It is exactly the same procedure of MT inference -
+   we get the state of previous words and then generate the next word.
+   Here, a state can be regarded as the representation of words (word 
+   indices, hidden states, embeddings and etc.).  */
+class T2TPredictor
+{
+private:
+    /* pointer to the transformer model (set by Read, not owned by the predictor) */
+    T2TModel * m;
+
+    /* current state (set by Read, not owned by the predictor) */
+    T2TStateBundle * s;
+
+    /* start symbol (-1 until SetStartSymbol is called) */
+    int startSymbol;
+
+public:
+    /* constructor */
+    T2TPredictor();
+
+    /* de-constructor */
+    ~T2TPredictor();
+
+    /* create an initial state */
+    void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
+
+    /* set the start symbol */
+    void SetStartSymbol(int symbol);
+
+    /* read a state (must be called before Predict) */
+    void Read(T2TModel * model, T2TStateBundle * state);
+
+    /* predict the next state */
+    void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
+
+    /* generate paths up to the states of the current step */
+    XTensor GeneratePaths(T2TStateBundle * state);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
--- a/source/sample/transformer/T2TSearch.h
+++ b/source/sample/transformer/T2TSearch.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
+ */
+
+#ifndef __T2TSEARCH_H__
+#define __T2TSEARCH_H__
+
+#include "T2TModel.h"
+#include "T2TPredictor.h"
+
+namespace transformer
+{
+
+/* The class organizes the search process. It calls "predictors" to generate
+   distributions of the predictions and prunes the search space by beam pruning.
+   This makes a graph where each path represents a translation hypothesis.
+   The output can be the path with the highest model score.
+   NOTE(review): only the declaration is visible here; the comments on the
+   methods describe the intent stated by their names - confirm against T2TSearch.cpp. */
+class T2TSearch
+{
+private:
+    /* the alpha parameter controls the length preference */
+    float alpha;
+
+    /* predictor */
+    T2TPredictor predictor;
+    
+    /* max length of the generated sequence */
+    int maxLength;
+    
+    /* beam size */
+    int beamSize;
+
+    /* batch size */
+    int batchSize;
+
+    /* we keep the final hypotheses in a heap for each sentence in the batch. */
+    XHeap<MIN_HEAP, float> * fullHypos;
+
+    /* array of the end symbols */
+    int * endSymbols;
+
+    /* number of the end symbols */
+    int endSymbolNum;
+
+    /* start symbol */
+    int startSymbol;
+
+public:
+    /* constructor */
+    T2TSearch();
+
+    /* de-constructor */
+    ~T2TSearch();
+    
+    /* initialize the model */
+    void Init(int argc, char ** argv);
+
+    /* search for the most promising states */
+    void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
+
+    /* preparation */
+    void Prepare(int myBatchSize,int myBeamSize);
+
+    /* compute the model score for each hypothesis */
+    void Score(T2TStateBundle * prev, T2TStateBundle * beam);
+
+    /* generate token indices via beam pruning */
+    void Generate(T2TStateBundle * beam);
+
+    /* expand the search graph */
+    void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
+
+    /* collect hypotheses with ending symbol */
+    void Collect(T2TStateBundle * beam);
+
+    /* fill the hypothesis heap with incomplete hypotheses */
+    void FillHeap(T2TStateBundle * beam);
+
+    /* save the output sequences in a tensor */
+    void Dump(XTensor * output);
+
+    /* check if the token is an end symbol */
+    bool IsEnd(int token);
+
+    /* set end symbols for search */
+    void SetEnd(const int * tokens, const int tokenNum);
+
+    /* check whether all hypotheses are completed */
+    bool IsAllCompleted(T2TStateBundle * beam);
+
+    /* make a mask to prevent duplicated entries in beam expansion for the first position */
+    XTensor MakeFirstMask(T2TStateBundle * beam);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TTester.cpp
+++ b/source/sample/transformer/T2TTester.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
+ */
+
+#include <math.h>
+#include "T2TUtility.h"
+#include "T2TTester.h"
+#include "T2TSearch.h"
+#include "../../tensor/XUtility.h"
+#include "../../tensor/core/CHeader.h"
+#include "../../network/XNoder.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* constructor: does nothing. vSize and vSizeTgt are not assigned here;
+   they get their values in Init(). */
+T2TTester::T2TTester()
+{
+}
+
+/* de-constructor: nothing to release explicitly */
+T2TTester::~T2TTester()
+{
+}
+
+/* 
+initialize the tester from the program arguments
+>> argc - number of arguments
+>> argv - list of arguments
+*/
+void T2TTester::Init(int argc, char ** argv)
+{
+    /* "vsize" must be loaded first because its value is passed on as the last
+       argument (presumably the default value - confirm against LoadParamInt)
+       when loading "vsizetgt" */
+    LoadParamInt(argc, argv, "vsize", &vSize, 1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
+
+    /* the batch loader and the searcher read their own settings from the same arguments */
+    batchLoader.Init(argc, argv);
+    seacher.Init(argc, argv);
+}
+
+/* 
+test the model: read the test file batch by batch, search for the output
+sequences of each batch, and dump them into the output file. Progress and
+a final summary are reported on stderr.
+>> fn - test data file
+>> ofn - output data file
+>> model - model that is trained (only MT models are supported)
+*/
+void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
+{
+    int wc = 0;
+    int ws = 0;
+    int wordCount = 0;
+    int wordCountTotal = 0;
+    int sentCount = 0;
+    int batchCount = 0;
+    float loss = 0;
+
+    /* data files */
+    FILE * file = fopen(fn, "rb");
+    CheckNTErrors(file, "Cannot read the test file");
+    FILE * ofile = fopen(ofn, "wb");
+    CheckNTErrors(ofile, "Cannot open the output file");
+
+    int devID = model->devID;
+    XMem * mem = model->mem;
+
+    XNet net;
+    
+    double startT = GetClockSec();
+        
+    wordCount = 0;
+        
+    /* batch of input sequences */
+    XTensor batchEnc;
+    XTensor batchDec;
+
+    /* label */
+    XTensor label;
+
+    /* padding */
+    XTensor paddingEnc;
+    XTensor paddingDec;
+
+    /* gold standard */
+    XTensor gold;
+
+    /* an array that keeps the sequences */
+    int * seqs = new int[MILLION];
+
+    batchLoader.SetRandomBatch(false);
+    batchLoader.ClearBuf();
+
+    /* the decoder-side batch goes into batchDec. paddingDec used to be passed
+       for both the decoder batch and the decoder padding, so that two outputs
+       of the loader shared one tensor and batchDec was never used.
+       NOTE(review): the argument order is taken from the former
+       T2TTrainer::LoadBatch declaration - confirm against T2TBatchLoader. */
+    while(batchLoader.LoadBatch(file, model->isLM, 
+                                &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
+                                seqs, vSize, vSizeTgt,
+                                1, 1, false, ws, wc, devID, mem, false))
+    {
+        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
+        CheckNTErrors(!model->isLM, "Only MT model is supported!");
+        
+        XTensor output;
+
+        seacher.Search(model, &batchEnc, &paddingEnc, &output);
+
+        Dump(ofile, &output);
+
+        /* the search does not report a probability, so the loss stays at 0 */
+        float prob = 0;
+            
+        loss += -prob;
+
+        /* statistics are based on the shape of the source batch:
+           last dimension = words, second-to-last dimension = sentences */
+        wc = batchEnc.GetDim(-1);
+        wordCount += wc;
+        wordCountTotal += wc;
+        sentCount += batchEnc.GetDim(-2);
+        batchCount += 1;
+
+        /* report after every batch */
+        if (batchCount % 1 == 0) {
+            double elapsed = GetClockSec() - startT;
+            XPRINT3(0, stderr, 
+                   "[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
+                    elapsed, sentCount, wordCount);
+        }
+    }
+        
+    fclose(file);
+    fclose(ofile);
+
+    delete[] seqs;
+    
+    double elapsed = GetClockSec() - startT;
+
+    /* avoid 0/0 (which printed NaN) when the test file yields no batch at all */
+    double ppl = wordCount > 0 ? exp(loss/wordCount) : 0.0;
+
+    XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
+            elapsed,wordCountTotal, sentCount, ppl);
+}
+
+/*
+dump the result into the file, one sequence per line. The tokens of a sequence
+are written as integers followed by a space; a sequence is cut right after the
+first negative token (which is still written).
+>> file - data file
+>> output - output tensor (its last dimension is the sequence length)
+*/
+void T2TTester::Dump(FILE * file, XTensor * output)
+{
+    const int seqLength = output->GetDim(-1);
+
+    /* "begin" is the flat offset of the first token of the current sequence */
+    for (int begin = 0; begin < output->unitNum; begin += seqLength) {
+        const int end = begin + seqLength;
+
+        for (int pos = begin; pos < end; pos++) {
+            const int token = output->GetInt(pos);
+            fprintf(file, "%d ", token);
+
+            /* a negative token terminates the sequence */
+            if (token < 0)
+                break;
+        }
+
+        fprintf(file, "\n");
+    }
+}
+
+}
--- a/source/sample/transformer/T2TTester.h
+++ b/source/sample/transformer/T2TTester.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
+ * A week with no trips :)
+ */
+
+#ifndef __T2TTESTER_H__
+#define __T2TTESTER_H__
+
+#include "T2TSearch.h"
+#include "T2TBatchLoader.h"
+
+namespace transformer
+{
+
+/* This class translates test sentences with a trained model. Call Init()
+   first (it sets the vocabulary sizes and prepares the loader and the
+   searcher), then Test(). */
+class T2TTester
+{
+public:
+    /* vocabulary size of the source side (parameter "vsize") */
+    int vSize;
+
+    /* vocabulary size of the target side (parameter "vsizetgt") */
+    int vSizeTgt;
+    
+    /* for batching */
+    T2TBatchLoader batchLoader;
+
+    /* decoder for inference. The name is a misspelling of "searcher";
+       it is kept because T2TTester.cpp refers to the member by this name. */
+    T2TSearch seacher;
+
+public:
+    /* constructor */
+    T2TTester();
+
+    /* de-constructor */
+    ~T2TTester();
+
+    /* initialize the model */
+    void Init(int argc, char ** argv);
+
+    /* test the model */
+    void Test(const char * fn, const char * ofn, T2TModel * model);
+
+    /* dump the result into the file */
+    void Dump(FILE * file, XTensor * output);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -23,11 +23,9 @@
 #define __T2TTRAINER_H__

 #include "T2TModel.h"
-
+#include "T2TBatchLoader.h"
 #include "../../tensor/function/FHeader.h"

-#define MAX_SEQUENCE_LENGTH 1024 * 4
-
 using namespace nts;

 namespace transformer
@@ -42,33 +40,6 @@ public:

    /* parameter array */
    char ** argArray;
-
-    /* buffer for loading words */
-    int * buf;
-
-    /* another buffer */
-    int * buf2;
-
-    /* buffer size */
-    int bufSize;
-
-    /* length of each sequence */
-    int * seqLen;
-
-    /* another array */
-    int * seqLen2;
-
-    /* offset of the first word for each sequence */
-    int * seqOffset;
-
-    /* number of sequences in the buffer */
-    int nseqBuf;
-
-    /* offset for next sequence in the buffer */
-    int nextSeq;
-    
-    /* indicates whether the sequence is sorted by length */
-    bool isLenSorted;
    
    /* dimension size of each inner layer */
    int d;
@@ -111,10 +82,10 @@ public:
    float adamBeta2T;

    /* list of the moment of the parameter matrics */
-    XList moments;
+    TensorList moments;

    /* list of the 2nd order moment of the parameter matrics */
-    XList moments2nd;
+    TensorList moments2nd;

    /* indicates whether the data file is shuffled for training */
    bool isShuffled;
@@ -130,20 +101,15 @@ public:
    
    /* number of batches on which we do model update */
    int updateStep;
-    
-    /* indicates whether we double the </s> symbol for the output of lms */
-    bool isDoubledEnd;
-    
-    /* indicates whether we use batchsize = max * sc
-       rather rather than batchsize = word-number, where max is the maximum
-       length and sc is the sentence number */
-    bool isSmallBatch;

-    /* counterpart of "isSmallBatch" */
-    bool isBigBatch;
+    /* indicates whether we intend to debug the net */
+    bool isDebugged;

-    /* indicates whether we use small memory footprint for backward process */
-    bool isSmallFootprint;
+    /* indicates whether the sequence is sorted by length */
+    bool isLenSorted;
+
+    /* for batching */
+    T2TBatchLoader batchLoader;

 public:
    /* constructor */
@@ -163,46 +129,6 @@ public:

    /* make a checkpoint */
    void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
-
-    /* load data to buffer */
-    int LoadBuf(FILE * file, bool isSorted, int step);
-
-    /* clear data buffer */
-    void ClearBuf();
-
-    /* load a batch of sequences */
-    int LoadBatch(FILE * file, bool isLM,
-                  XTensor * batchEnc, XTensor * paddingEnc, 
-                  XTensor * batchDec, XTensor * paddingDec,
-                  XTensor * gold,
-                  int * seqs,
-                  int vsEnc, int vsDec, int sBatch, int wBatch, 
-                  bool isSorted, int &wCount,
-                  int devID, XMem * mem, 
-				  bool isTraining);
-
-    /* load a batch of sequences (for language modeling) */
-    int LoadBatchLM(FILE * file, 
-                    XTensor * batchEnc, XTensor * paddingEnc,
-                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold,
-                    int * seqs, int vs, int sBatch, int wBatch, 
-                    bool isSorted, int &wCount,
-                    int devID, XMem * mem, 
-					bool isTraining);
-
-    /* load a batch of sequences (for machine translation) */
-    int LoadBatchMT(FILE * file, 
-                    XTensor * batchEnc, XTensor * paddingEnc, 
-                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold,
-                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
-                    bool isSorted, int &wCount,
-                    int devID, XMem * mem, 
-					bool isTraining);
-
-    /* shuffle the data file */
-    void Shuffle(const char * srcFile, const char * tgtFile);
    
    /* get word probabilities for a batch of sequences */
    float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -25,6 +25,8 @@
 #include "T2TModel.h"
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
+#include "T2TPredictor.h"
+#include "T2TTester.h"
 #include "../../tensor/XDevice.h"
 #include "../../tensor/XUtility.h"
 #include "../../tensor/XGlobal.h"
@@ -36,8 +38,6 @@ int TransformerMain(int argc, const char ** argv)
 {
    if(argc == 0)
        return 1;
-    
-    fprintf(stderr, "%e\n", log(1e-8F));

    char ** args = new char*[argc];
    for(int i = 0; i < argc; i++){
@@ -49,6 +49,7 @@ int TransformerMain(int argc, const char ** argv)

    ShowParams(argc, args);

+    bool isBeamSearch = false;
    char * trainFN = new char[MAX_LINE_LENGTH];
    char * modelFN = new char[MAX_LINE_LENGTH];
    char * testFN = new char[MAX_LINE_LENGTH];
@@ -58,8 +59,10 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "model", modelFN, "");
    LoadParamString(argc, args, "test", testFN, "");
    LoadParamString(argc, args, "output", outputFN, "");
+    LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);

    srand((unsigned int)time(NULL));
+
    T2TTrainer trainer;
    trainer.Init(argc, args);

@@ -78,12 +81,22 @@ int TransformerMain(int argc, const char ** argv)
    if(strcmp(modelFN, ""))
        model.Read(modelFN);

-    T2TTrainer tester;
-    tester.Init(argc, args);
-
    /* test the model on the new data */
-    if(strcmp(testFN, "") && strcmp(outputFN, ""))
-        tester.Test(testFN, outputFN, &model);
+    if(strcmp(testFN, "") && strcmp(outputFN, "")){
+        /* beam search */
+        if(isBeamSearch){
+            T2TTester searcher;
+            searcher.Init(argc, args);
+            searcher.Test(testFN, outputFN, &model);
+        }
+
+        /* forced decoding */
+        else{
+            T2TTrainer tester;
+            tester.Init(argc, args);
+            tester.Test(testFN, outputFN, &model);
+        }
+    }

    delete[] trainFN;
    delete[] modelFN;

--- a/source/tensor/Main.cpp
+++ b/source/tensor/Main.cpp
@@ -30,6 +30,7 @@
 #include "XDevice.h"
 #include "./test/Test.h"
 #include "./core/CHeader.h"
+#include "./loss/CrossEntropy.h"

 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>  

--- a/source/tensor/XDataType.cpp
+++ b/source/tensor/XDataType.cpp
@@ -90,7 +90,10 @@ data type conversion
 >> typeT - target data type
 >> size - number of the items in s (and t)
 */
-void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
+void ConvertDataType(int devID, 
+                     void * s, TENSOR_DATA_TYPE typeS, 
+                     void * t, TENSOR_DATA_TYPE typeT, 
+                     int size)
 {
    CheckNTErrors((devID < 0), "This code must be run on CPUs!");


--- a/source/tensor/XDataType.h
+++ b/source/tensor/XDataType.h
@@ -53,12 +53,6 @@ void ConvertDataType(int devID,
                     void * s, TENSOR_DATA_TYPE typeS, 
                     void * t, TENSOR_DATA_TYPE typeT, int size);

-#ifdef USE_CUDA
-void CudaConvertDataType(int devID, 
-                         void * s, TENSOR_DATA_TYPE typeS, 
-                         void * t, TENSOR_DATA_TYPE typeT, int size);
-#endif
-
 } /* end of the nts (NiuTrans.Tensor) namespace */

 #endif
\ No newline at end of file
--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include "XDevice.h"
 #include "XGlobal.h"
 #include "XThread.h"
@@ -59,6 +60,7 @@ XDevice::~XDevice()
        cublasDestroy(cublasHandle);
    if(stream != NULL)
        delete stream;
+    curandDestroyGenerator(gen);
 #endif
 }

@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
    Clear();

    devID = myDevID;
+    seed = rand();

    /* CPU information */
    if(devID < 0){
@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
        cudaDeviceProp prop;

        cudaSetDevice(myDevID);
+
+        curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+        curandSetPseudoRandomGeneratorSeed(gen, seed);
+
        if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
            XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
            exit(1);
@@ -194,7 +201,8 @@ void XDevice::SetGPUDevice(int devID)
    cudaError_t error = cudaSetDevice(devID);

    if (error != cudaSuccess){
-        fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
+        fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n",
+                devID, error, cudaGetErrorString(error));
        exit(1);
    }
 #else
@@ -209,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID)
    SetFastFlags();
 }

-/* switch to a get current dev */
+/* get the id of the current GPU device */
 int XDevice::GetGPUDevice()
 {
 #ifdef USE_CUDA
@@ -217,7 +225,8 @@ int XDevice::GetGPUDevice()
    cudaError_t error = cudaGetDevice(&devID);

    if (error != cudaSuccess){
-        fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
+        fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n",
+                devID, error, cudaGetErrorString(error));
        exit(1);
    }

@@ -241,7 +250,7 @@ void XDevice::SetFastFlags()
 #endif
 }

-/* reset cuda flag for more efficient cuda execution (all devices) */
+/* reset the cuda flag for more efficient cuda execution (all devices) */
 void XDevice::SetFastFlagsAllDevices()
 {
 #ifdef USE_CUDA
@@ -267,9 +276,11 @@ XDevManager::~XDevManager()
 }


-/* initialize it and get the CPU and GPU information */
+/* initialization */
 void XDevManager::Init()
 {
+    srand((unsigned int)time(NULL));
+
    Clear();

    /* CPUs (we actually do not care about how many CPUs are using) */
@@ -309,7 +320,7 @@ void XDevManager::Clear()

 #ifdef USE_CUDA

-/* get the handle of GPU */
+/* get the handle of a given GPU */
 cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
 {
    CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
@@ -317,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
    return GPUs[devID].GetCublasHandle();
 }

-/* get the stream of cuda */
+/* get the stream of a given GPU */
 cudaStream_t * XDevManager::GetCudaStream(const int devID)
 {
    CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
@@ -465,7 +476,7 @@ split a string
 >> items - splitting result
 << return - how many items are there
 */
-int SplitALine(char * inputString, const char * seperator, XList * items)
+int SplitALine(char * inputString, const char * seperator, StrList* items)
 {
    items->Clear();

@@ -514,12 +525,12 @@ get device ids for the given device information
             devInfo = "0:CPU-1 1:GPU-0 2:CPU-1"
             means that the first device is CPU, the second device
             is GPU-0, the third device is CPU.
->> devIDs - device sequence specified by devInfo
+>> devIDs - device IDs specified by devInfo
 << return - number of devices
 */
 int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 {
-    XList * terms = new XList(1);
+	StrList* terms = new StrList(1);
    SplitALine(devInfo, " ", terms);

    for(int i = 0; i < terms->count; i++){
@@ -556,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
    return devCount;
 }

-/* show id sequence */
+/* show device IDs */
 void XDevManager::ShowDeviceIDs(char * devInfo, char * msg)
 {
    msg[0] = 0;

--- a/source/tensor/XDevice.h
+++ b/source/tensor/XDevice.h
@@ -99,6 +99,9 @@ public:

    /* default stream for the device */
    XStream * stream;
+
+    /* seed for random number generation */
+    int seed;
    
 #ifdef USE_CUDA
    /* mutex for handle (GPU cublas) */
@@ -109,6 +112,9 @@ public:

    /* specify if the handle is initialized */
    bool isHandleReady;
+
+    /* generator of random numbers */
+    curandGenerator_t gen;
 #endif


@@ -230,6 +236,18 @@ extern XDevManager GDevs;
        cudaSetDevice(devIDBackup); \
 } \

+/*
+abort if two device ids do not refer to the same device, i.e., one is a CPU id (< 0)
+and the other is a GPU id (>= 0), or both are GPU ids but they differ.
+Each argument is evaluated exactly once.
+*/
+#define CheckDev(a, b) \
+{ \
+    const int checkDevA = (a); \
+    const int checkDevB = (b); \
+    if(((checkDevA < 0) != (checkDevB < 0)) || (checkDevA >= 0 && checkDevA != checkDevB)){ \
+        fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, checkDevA, checkDevB); \
+        exit(1); \
+    } \
+} \
+
 } /* end of the nts (NiuTrans.Tensor) namespace */

 #endif
--- a/source/tensor/XGlobal.h
+++ b/source/tensor/XGlobal.h
@@ -49,7 +49,7 @@ namespace nts {

 #ifdef DOUBELPRICSION
 #define DTYPE double
-#define DTYPE_MIN (DTYPE)1.79E+308
+#define DTYPE_MIN (DTYPE)-1.79E+308
 #else
 #define DTYPE float
 #define DTYPE_MIN (DTYPE)-3.40E+38
@@ -153,7 +153,9 @@ extern bool useCUDA;
 #define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
 #define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}

-#define B2I(V) V==0?false:true
+/* true unless V equals 0 (fully parenthesized so that it can be used inside larger expressions) */
+#define B2I(V) ((V) == 0 ? false : true)
+
+/* floored modulo of a by b, converted to int; a itself is used when b is 0 */
+#define MODX(a, b) int((b) == 0 ? (a) : (a) - floor(double(a)/(b)) * (b))

 /* BLAS interfaces */
 #ifdef DOUBELPRICSION

--- a/source/tensor/XHeap.cpp
+++ b/source/tensor/XHeap.cpp
@@ -31,15 +31,15 @@ namespace nts{

 /* constructor */
 template<HeapType hType, typename T>
+XHeap<hType, T>::XHeap()
+{
+    /* keep the heap in a well-defined empty state until Init() is called */
+    mem = NULL;
+    size = 0;
+    count = 0;
+    items = NULL;
+}
+
+/* constructor */
+template<HeapType hType, typename T>
 XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
 {
-    mem = myMem;
-    size = mySize;
-    count = 0;
-    if (mem == NULL)
-        items = new HeapNode<T>[mySize];
-    else
-        mem->Alloc(mem->devID, mySize * sizeof(T));
+    Init(mySize, myMem);
 }

 /* deconstructor */
@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap()
 }

 template<HeapType hType, typename T>
+void XHeap<hType, T>::Init(int mySize, XMem * myMem)
+{
+    /*
+    initialization
+    >> mySize - the maximum number of nodes kept in the heap
+    >> myMem - the memory pool the node array is taken from (NULL: use "new")
+    */
+    mem = myMem;
+    size = mySize;
+    count = 0;
+
+    if (mem == NULL)
+        items = new HeapNode<T>[mySize];
+    else {
+        /* keep the returned buffer (it was dropped before, leaving "items" unset) and
+           reserve room for whole nodes rather than for bare values.
+           NOTE(review): assumes XMem::Alloc returns the allocated address as void* - confirm */
+        items = (HeapNode<T>*)mem->Alloc(mem->devID, mySize * sizeof(HeapNode<T>));
+    }
+}
+
+template<HeapType hType, typename T>
 void XHeap<hType, T>::Clear(T initValue)
 {
    count = 0;
@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End()
 template<HeapType hType, typename T>
 _XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
 {
-    //CheckNTErrors((count < size), "Heap is full!");
-    items[count] = node;
-    Up(count);
-    count++;
+    if (count < size) {
+        items[count] = node;
+        Up(count);
+        count++;
+    }
+    else if(count == size){
+        HeapNode<T> & item0 = items[0];
+        if (hType == MIN_HEAP && item0.value >= node.value)
+            return;
+        else if (hType == MAX_HEAP && item0.value <= node.value)
+            return;
+        items[0] = node;
+        Down(0);
+    }
+    else {
+        ShowNTErrors("Overflow of the heap!");
+    }
+    
 }

 /* replace the top-most item and update the heap */
@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
 template<HeapType hType, typename T>
 _XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
 {
-    //CheckNTErrors((size > 0), "Empty heap!");
+    CheckNTErrors(count > 0, "Empty heap!");
    HeapNode<T> node = items[0];
    items[0] = items[count - 1];
    count--;

--- a/source/tensor/XHeap.h
+++ b/source/tensor/XHeap.h
@@ -39,7 +39,7 @@ template <typename T>
 struct HeapNode
 {
    /* node index */
-    int index;
+    long long index;

    /* value of the node */
    T value;
@@ -52,9 +52,16 @@ struct HeapNode

    HeapNode(int i, T v)
    {
-        index = i;
+        index = (long long)i;
        value = v;
    };
+
+    HeapNode(void * i, T v)
+    {
+        index = (long long)i;
+        value = v;
+
+    }
 };

 /* a heap that keeps a data array of T */
@@ -76,11 +83,17 @@ public:

 public:
    /* constructor */
+    XHeap();
+
+    /* constructor */
    XHeap(int mySize, XMem * myMem = NULL);

    /* deconstructor */
    ~XHeap();

+    /* initialization */
+    void Init(int mySize, XMem * myMem = NULL);
+
    /* clear the data */
    void Clear(T initValue);

@@ -107,6 +120,9 @@ public:

    /* move item k up the tree */
    void Up(int k);
+
+    /* how many items are kept in the heap */
+    inline int Count() { return count; };
 };

 } /* end of the nts (NiuTrans.Tensor) namespace */

--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
@@ -300,9 +300,30 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
    if(h == NULL)
        return;
    
-    XList list(2);
-    list.Add(t1);
-    list.Add(t2);
+    TensorList list(2);
+    list.Add((XTensor*)t1);
+    list.Add((XTensor*)t2);
+
+    MakeLink(&list, h, id);
+}
+
+/*
+create a hyperedge with three input tensors and an output tensor
+>> t1 - a tail tensor
+>> t2 - the second tail tensor
+>> t3 - the third tail tensor
+>> h - head tensor
+>> id - id of the edge type
+*/
+void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,XTensor * h, int id)
+{
+    if (h == NULL)
+        return;
+
+    TensorList list(3);
+    list.Add((XTensor*)t1);
+    list.Add((XTensor*)t2);
+    list.Add((XTensor*)t3);

    MakeLink(&list, h, id);
 }
@@ -313,7 +334,7 @@ create a hyper edge with a list of tensors and a output tensor
 >> h - head tensor
 >> id - id of the edge type
 */
-void XLink::MakeLink(const XList * list, XTensor * h, int id)
+void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
 {
    /* forward */
    XLink &income = h->income;
@@ -347,7 +368,7 @@ create a hyper edge with a input tensors and a list of output tensors
 >> list - a list of output tensors
 >> id - id of the edge type
 */
-void XLink::MakeLink(XTensor * t, XList * list, int id)
+void XLink::MakeLink(XTensor * t, TensorList * list, int id)
 {
    /* forward */
    for(int i = 0; i < list->count; i++){
@@ -507,8 +528,92 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
            CheckNTErrors(hit, "No proper node found in parent.income edge!");
        }
    }
+
+    strcpy(newOne->name, oldOne->name);
 }

+
+/*
+copy the links of a node to another node, i.e., the existing links of the new
+node are cleared, the tails of both edges of the reference node are copied to it,
+and the new node is added as a tail to the related edges of those neighbours
+>> reference - the node to be copied
+>> target - the new node
+*/
+void XLink::Copy(const XTensor * reference, XTensor * target)
+{
+    if (reference == NULL || target == NULL)
+        return;
+
+    XLink &newIncome = target->income;
+    XLink &newOutgo = target->outgo;
+
+    /* drop whatever links the target had before */
+    XLink::ClearOutgoing(target);
+    XLink::ClearIncoming(target);
+
+    /* incoming nodes */
+    if (reference->income.typeID != 0) {
+        /* enlarge the tail array if it cannot hold all tails of the reference */
+        if (newIncome.tailNum < reference->income.tailNum) {
+            delete[] newIncome.tails;
+            newIncome.tails = new XTensor*[reference->income.tailNum];
+        }
+
+        newIncome.SetType(reference->income.typeID);
+        newIncome.head = target;
+        newIncome.tailNum = reference->income.tailNum;
+        memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
+
+        /* copy the parameters of the incoming edge (paramNum items of paramSize bytes).
+           NOTE(review): a buffer previously held by newIncome.params is not released here
+           and paramSize is not copied - confirm that ClearIncoming/SetType take care of both */
+        int paraArraySize = reference->income.paramNum * reference->income.paramSize;
+        newIncome.params = new char[paraArraySize];
+        memcpy(newIncome.params, reference->income.params, paraArraySize);
+        newIncome.paramNum = reference->income.paramNum;
+
+        /* update the link to each child node */
+        for (int i = 0; i < newIncome.tailNum; i++) {
+            XTensor * child = newIncome.tails[i];
+            XLink &childOutgo = child->outgo;
+            bool hit = false;
+            for (int j = 0; j < childOutgo.tailNum; j++) {
+                if (childOutgo.tails[j] == reference) {
+                    /* the reference stays linked; the target is appended next to it (once) */
+                    //childOutgo.tails[j] = target;
+                    childOutgo.AddTail(target);
+                    hit = true;
+                    break;
+                }
+            }
+
+            if (childOutgo.tailNum > 0) {
+                CheckNTErrors(hit, "No proper node found in child.outgo edge!");
+            }
+        }
+    }
+
+    /* enlarge the tail array if it cannot hold all tails of the reference */
+    if (newOutgo.tailNum < reference->outgo.tailNum) {
+        delete[] newOutgo.tails;
+        newOutgo.tails = new XTensor*[reference->outgo.tailNum];
+    }
+
+    /* outgoing nodes */
+    newOutgo.head = target;
+    newOutgo.tailNum = reference->outgo.tailNum;
+    memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
+
+    /* update the link to each parent node */
+    for (int i = 0; i < newOutgo.tailNum; i++) {
+        XTensor * parent = newOutgo.tails[i];
+        XLink &parentIncome = parent->income;
+        bool hit = false;
+        /* unlike the loop over children there is no break here, so the target is
+           appended once for every occurrence of the reference in the parent's tails */
+        for (int j = 0; j < parentIncome.tailNum; j++) {
+            if (parentIncome.tails[j] == reference) {
+                //parentIncome.tails[j] = target;
+                parentIncome.AddTail(target);
+                hit = true;
+            }
+        }
+
+        if (parentIncome.tailNum > 0) {
+            CheckNTErrors(hit, "No proper node found in parent.income edge!");
+        }
+    }
+}
 /* 
 copy incoming edges of a given node
 >> reference - the node we copy from
@@ -521,7 +626,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
    ClearIncoming(target);

    int tailNum = reference->income.tailNum;
-    XList tails(tailNum);
+    TensorList tails(tailNum);
    for(int i = 0; i < tailNum; i++){
        XTensor * tail = (XTensor*)reference->income.tails[i];
        tails.Add(tail);
@@ -634,6 +739,29 @@ void XLink::ShowNode(FILE * file, XTensor * node)

    fprintf(stderr, "\n");
 }
+
+/* 
+search for a node in a top-down manner by its name 
+(depth-first over the incoming edges; the first match is returned)
+>> top - the top most node
+>> name - the name of the node we are looking for
+<< return - the node we found (NULL if no node has this name)
+*/
+XTensor * XLink::SearchNode(XTensor * top, const char * name)
+{
+    /* nothing to search in (or for) */
+    if(top == NULL || name == NULL)
+        return NULL;
+
+    if(!strcmp(top->name, name))
+        return top;
+
+    XLink &incoming = top->income;
+
+    /* go deeper into the nodes that the current node is computed from */
+    for(int i = 0; i < incoming.tailNum; i++){
+        XTensor * child = incoming.tails[i];
+        XTensor * hit = SearchNode(child, name);
+        if(hit != NULL)
+            return hit;
+    }
+
+    return NULL;
+}
+
    
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/XLink.h
+++ b/source/tensor/XLink.h
@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* cross reference */
 struct XTensor;

-#define MAX_OP_NAME_LENGTH 16
+#define MAX_OP_NAME_LENGTH 64
 #define PARAM_UNTI_SIZE    64

 /*
@@ -138,13 +138,17 @@ struct XLink
    static
    void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);

+    /* create a hyper edge with three input tensors and a output tensor */
+    static
+    void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
+
    /* create a hyper edge with a list of input tensors and a output tensor */
    static
-    void MakeLink(const XList * list, XTensor * h, int id);
+    void MakeLink(const TensorList * list, XTensor * h, int id);

    /* create a hyper edge with a input tensors and a list of output tensors */
    static
-    void MakeLink(XTensor * h, XList * list, int id);
+    void MakeLink(XTensor * h, TensorList * list, int id);

    /* add a parameter */
    static
@@ -170,6 +174,10 @@ struct XLink
    static 
    void Replace(const XTensor * oldOne, XTensor * newOne);

+    /* copy a node with another, i.e., we add the links to the new node */
+    static
+    void Copy(const XTensor * reference, XTensor * target);
+
    /* copy links of a given node */
    static
    void CopyIncoming(const XTensor * reference, XTensor * target);
@@ -181,6 +189,10 @@ struct XLink
    /* show a node */
    static
    void ShowNode(FILE * file, XTensor * node);
+
+    /* search a node in a top-down manner by its name */
+    static
+    XTensor * SearchNode(XTensor * top, const char * name);
 };
    
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
--- a/source/tensor/XList.h
+++ b/source/tensor/XList.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,32 +15,31 @@
 * limitations under the License.
 */

-/*
- * 
- * Implementation of list that keeps data items
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
- * The first coding job this year!
- *
- */
-
-#ifndef __XLIST_H__
-#define __XLIST_H__
+ /*
+  *
+  * Implementation of template list that keeps data items
+  *
+  * $Created by: HU Chi (huchinlp@foxmail.com)
+  *
+  */

 #include "XMem.h"
 #include "XGlobal.h"

-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
+#ifndef __TensorList_H__
+#define __TensorList_H__

-typedef int (* ListCompare)(const void * item1, const void * item2);

-/* the XList class */
-class XList
-{
+/* the nts (NiuTrans.Tensor) namespace */
+namespace nts {
+	
+/* the TensorListBase class */
+template <typename T>
+struct TensorListBase {
 public:
+
    /* data items */
-    void ** items;
+    T *items;

    /* number of items */
    int count;
@@ -49,56 +48,88 @@ public:
    int maxNum;

    /* the memory pool for data array allocation */
-    XMem * mem;
-
-    /* indicates whether data items are integers */
-    bool isIntList;
+    XMem* mem;

 public:
    /* constructor */
-    XList();
+    TensorListBase();

    /* constructor */
-    XList(int myMaxNum, bool isIntListOrNot = false);
+    TensorListBase(int myMaxNum);

    /* constructor */
-    XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
+    TensorListBase(int myMaxNum, XMem* myMem);

    /* de-constructor */
-    ~XList();
-
-    /* utilities */
-    void Create(int myMaxNum, XMem * myMem);
-    void Add(const void * item);
-    void Add(void ** inputItems, int inputItemCount);
-    void AddList(XList * l);
-    void AddInt(int i);
-    void Insert(int pos, void * item);
-    void * GetItem(int i) const;   
-    int GetItemInt(int i);
-    void SetItem(int i, void * item);
-    void SetItemInt(int i, int item);
-    
-    int FindFirst(void * item);
+    ~TensorListBase();
+
+    /* add an item into the list */
+    void Add(T&& item);
+
+	/* add an item into the list */
+	void Add(const T& item);
+
+	/* add a number of items into the list */
+    void Add(T* inputItems, int inputItemCount);
+
+	/* append a list to the current list */
+    void AddList(TensorListBase* l);
+
+	/* insert an item to the given position of the list */
+    void Insert(int pos, const T& item);
+
+	/* insert an item to the given position of the list */
+	void Insert(int pos, T&& item);
+
+	/* get the item at position i */
+    T& GetItem(int i) const;
+
+	/* set the item at position i */
+    void SetItem(int i, const T& item);
+
+	/* set the item at position i */
+	void SetItem(int i, T&& item);
+
+	/* find the position of the first matched item  */
+    int FindFirst(const T& item);
+
+	/* clear the data array */
    void Clear();
-    void ClearStringList();
-    void Sort(int itemSize, ListCompare comp);
+
+	/* sort the list */
+    void Sort(int itemSize);
+
+	/* reverse the list */
    void Reverse();
+
+	/* remove the item at position i */
    void Remove(int i);
-    XList * Copy(XMem * myMem);
+
+	/* copy the list */
+    TensorListBase* Copy(XMem* myMem);
+
+	/* shuffle the list */
    void Shuffle(int nround = 10, int beg = -1, int len = 0);

    /* short */
-    _XINLINE_ void * Get(int i) {return GetItem(i);};
-    _XINLINE_ int GetInt(int i) {return GetItemInt(i);};
-    _XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
-    _XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
-
+	T& operator[] (int i) {
+		return GetItem(i);
+	};
+    T& Get(int i) { return GetItem(i); };
+	void Set(int i, T item) { SetItem(i, item); };
 };

-extern XList NULLList;
+struct XTensor;
+
+typedef TensorListBase<int> IntList;
+typedef TensorListBase<char> CharList;
+typedef TensorListBase<char*> StrList;
+typedef TensorListBase<long> LongList;
+typedef TensorListBase<float> FloatList;
+typedef TensorListBase<short> ShortList;
+typedef TensorListBase<void*> XList;
+typedef TensorListBase<XTensor*> TensorList;

-} 
-/* end of the nts (NiuTrans.Tensor) namespace */
+} /* end of the nts (NiuTrans.Tensor) namespace */

-#endif
+#endif // __TensorList_H__
--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -34,6 +34,11 @@ namespace nts{
 int testxmemid = 0;
 void * recordp = NULL;

+/*
+for managing the memories
+*/
+XMemManager GMems;
+
 XMem * GMem;

 /* constructor */
@@ -58,7 +63,7 @@ constructor
 >> myMode - mode of running the memory pool
            UNI_FREE: free all the space at the end of using the memory pool
            FREE_ON_THE_FLY: normal "malloc" and "free" mode
->> myBlockSize - size of memory block
+>> myBlockSize - size of a memory block
 >> myBlockNum  - number of memory blocks
 >> myBufSize - size of buffer
 */
@@ -103,7 +108,7 @@ initialize it
 >> myMode - mode of running the memory pool
            UNI_FREE: free all the space at the end of using the memory pool
            FREE_ON_THE_FLY: normal "malloc" and "free" mode
->> myBlockSize - size of memory block
+>> myBlockSize - size of a memory block
 >> myBlockNum  - number of memory blocks
 >> myBufSize - size of buffer
 */
@@ -216,9 +221,9 @@ void XMem::Free(int myDevID, void * mem)
    }
 }

-/* 
-get signature 
-<< return - return the signature
+/*
+get the signature
+<< return - the signature
 */
 MTYPE XMem::GetSignature()
 {
@@ -226,7 +231,7 @@ MTYPE XMem::GetSignature()
 }

 /* 
-use string as the name of the memory pool 
+set the name of the memory pool 
 >> myName - name of the memory pool
 */
 void XMem::SetName(const char * myName)
@@ -259,7 +264,7 @@ void XMem::SetDevice(int myDevID)
 }

 /* 
-switch to the device (with fast cuda execution mode) we want to work 
+switch to the device (with fast cuda execution mode) we intend to work on
 >> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
 */
 void XMem::SetDeviceFast(int myDevID)
@@ -275,7 +280,7 @@ void XMem::SetDeviceFast(int myDevID)
 }

 /* 
-run in static mode 
+run in the static mode
 >> myIsStatic - specify if the memory allocation is static
 */
 void XMem::SetStaticMode(bool myIsStatic)
@@ -1461,6 +1466,23 @@ void XMem::CreateBLASHandle()
 #endif
 }

+/* 
+show profile of the memory pool, i.e., the total size of the blocks that hold
+memory ("mem"), the size in use within these blocks ("used") and the ratio of
+the two ("usage")
+>> file - where we print the profile
+*/
+void XMem::ShowMemUsage(FILE * file)
+{
+    MTYPE used = 0;
+    MTYPE total = 0;
+
+    /* blocks without memory are not counted */
+    for(int i = 0; i < blockNum; i++){
+        if(blocks[i].mem != NULL){
+            used  += blocks[i].used;
+            total += blocks[i].size;
+        }
+    }
+
+    /* avoid 0/0 when no block has been allocated */
+    DTYPE usage = total > 0 ? (DTYPE)used/total : (DTYPE)0;
+
+    /* the total goes with "mem" and the occupied part with "used" */
+    fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n", 
+           (DTYPE)total/MILLION, (DTYPE)used/MILLION, usage);
+}
+
 #ifdef USE_CUDA

 /* get the handle of cublas */
@@ -1471,4 +1493,170 @@ cublasHandle_t * XMem::GetCublasHandle()

 #endif

+/* constructor (all the work is done by Initialize()) */
+XMemManager::XMemManager()
+{
+    Initialize();
+}
+
+/* de-constructor (nothing is released explicitly here) */
+XMemManager::~XMemManager()
+{
+}
+
+/* 
+get the size of the main memory we can use (in bytes)
+<< return - the available physical memory on Windows and Linux. On Mac the value
+            of HW_MEMSIZE is returned, i.e., the size of the physical memory
+            rather than its free part. 0 means that the size is unknown.
+*/
+MTYPE XMemManager::GetAvailableMemory()
+{
+    /* a 64-bit counter on every platform: "unsigned long" has 32 bits on Windows
+       and would truncate sizes of 4GB and more */
+    unsigned long long freeMem = 0;
+#if __APPLE__
+    int mib[2] = {CTL_HW, HW_MEMSIZE};
+    unsigned int namelen = sizeof(mib) / sizeof(mib[0]);
+    unsigned long long size;
+    size_t len = sizeof(size);
+    if (sysctl(mib, namelen, &size, &len, NULL, 0) < 0){
+        ShowNTErrors("Cannot get memory size on Mac!");
+    }
+    else{
+        return size;
+    }
+#elif _WIN32
+    MEMORYSTATUSEX memoryStatus;
+    memoryStatus.dwLength = sizeof(memoryStatus);
+    if (GlobalMemoryStatusEx(&memoryStatus)){
+        freeMem = memoryStatus.ullAvailPhys;
+    }
+#else
+    long pages = sysconf(_SC_AVPHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+
+    /* sysconf returns -1 on failure; the product is computed in 64 bits so that
+       it does not overflow where "long" has 32 bits */
+    if (pages > 0 && page_size > 0)
+        freeMem = (unsigned long long)pages * (unsigned long long)page_size;
+#endif
+    return (MTYPE)freeMem;
+}
+
+/* 
+get the size of the free memory of a GPU
+>> devID - id of the GPU device
+<< return - the free memory reported by cudaMemGetInfo
+            (always 0 if the code is compiled without USE_CUDA)
+*/
+MTYPE XMemManager::GetAvailableGPUMemory(int devID)
+{
+    size_t freeMem = 0;
+    
+#ifdef USE_CUDA
+    size_t totalMem = 0;
+    /* the query is made on the given device; the previous device is not restored here */
+    cudaSetDevice(devID);
+    if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
+        XPRINT(0, stderr, "cannot get GPU memory information.");
+        exit(1);
+    }
+#endif
+    return (MTYPE)freeMem;
+}
+
+/* 
+choose the size of the buffer according to the size of the free memory:
+the buffer takes a quarter of the largest threshold (128M, 256M, 512M, 1024M or
+2048M, with M = MILLION) that the free memory reaches, and 0 if it is below all of them
+>> freeMem - size of the free memory
+>> myBufSize - where we keep the chosen buffer size
+*/
+void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
+{
+    /* thresholds are tested from the largest to the smallest */
+    if (freeMem >= MILLION * 2048)
+        *myBufSize = MILLION * 512;
+    else if (freeMem >= MILLION * 1024)
+        *myBufSize = MILLION * 256;
+    else if (freeMem >= MILLION * 512)
+        *myBufSize = MILLION * 128;
+    else if (freeMem >= MILLION * 256)
+        *myBufSize = MILLION * 64;
+    else if (freeMem >= MILLION * 128)
+        *myBufSize = MILLION * 32;
+    else
+        *myBufSize = 0;
+}
+
+/* 
+initialize it and set the global memory information, i.e., one memory pool is
+created for the CPU and one for each GPU (at most MAX_GPU_NUM of them). The buffer
+size of a pool is chosen by GetBufferSize from the free memory of its device.
+*/
+void XMemManager::Initialize()
+{
+    srand((unsigned int)time(NULL));
+
+    /* release what a previous initialization may have created */
+    Free();
+    
+    /* CPUs (we actually do not care about how many CPUs are using) */
+    nCPUMem = 1;
+
+    MTYPE freeMem = GetAvailableMemory();
+    MTYPE myBufSize = 0;
+    GetBufferSize(freeMem, &myBufSize);
+    CPUMems[0].Initialize(-1, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
+
+    /* GPUs */
+    nGPUMem = 0;
+
+#ifdef USE_CUDA
+    if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
+        XPRINT(0, stderr, "cannot get GPU information.");
+        exit(1);
+    }
+
+    /* GPUMems has MAX_GPU_NUM slots only, so we never index beyond them */
+    if (nGPUMem > MAX_GPU_NUM) {
+        XPRINT1(0, stderr, "Too many GPUs! Memory pools are created for the first %d GPUs only.\n", MAX_GPU_NUM);
+        nGPUMem = MAX_GPU_NUM;
+    }
+
+    for (int i = 0; i < nGPUMem; i++) {
+        MTYPE freeGPUMem = GetAvailableGPUMemory(i);
+        MTYPE myGPUBufSize = 0;
+        GetBufferSize(freeGPUMem, &myGPUBufSize);
+        GPUMems[i].Initialize(i, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myGPUBufSize);
+    }
+
+#endif
+}
+
+/* 
+free it, i.e., call Free() on every slot of the CPU and GPU pool arrays
+(all MAX_CPU_NUM/MAX_GPU_NUM slots are visited, no matter how many pools are in use)
+*/
+void XMemManager::Free()
+{
+    for (int i = 0; i < MAX_CPU_NUM; i++)
+        CPUMems[i].Free();
+    for (int i = 0; i < MAX_GPU_NUM; i++)
+        GPUMems[i].Free();
+}
+
+/* 
+get global memory pool
+>> devID - device id (< 0: CPU, >= 0: GPU device id)
+<< return - the first CPU pool for devID < 0, the pool of the GPU for
+            0 <= devID < nGPUMem, and NULL (after printing a message) otherwise
+*/
+XMem * XMemManager::GetMem(const int devID)
+{
+    XMem * mem = NULL;
+    if (devID < 0)
+        mem = CPUMems;
+    else{
+        if (devID < nGPUMem)
+            mem = GPUMems + devID;
+        else
+            XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
+    }
+    
+    return mem;
+}
+
+/* 
+get global memory size
+>> devID - device id (< 0: CPU, >= 0: GPU device id)
+>> myBlockSize - where we keep maxBlockSize of the pool
+>> myBlockNum - where we keep blockNum of the pool
+>> myBufSize - where we keep bufSize of the pool
+<< return - 1 if the pool is found (the three outputs are set),
+            0 otherwise (the outputs are left untouched)
+*/
+int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
+{
+    XMem * mem = GetMem(devID);
+    int result = 0;
+    if (mem != NULL){
+        *myBlockSize = mem->maxBlockSize;
+        *myBlockNum = mem->blockNum;
+        *myBufSize = mem->bufSize;
+        result = 1;
+    }
+    return result;
+}
+
+/* 
+show memory information, i.e., print block size, block number and buffer size
+of the CPU pool and of each GPU pool to stderr (via XPRINT with level 1)
+*/
+void XMemManager::ShowMemInfo()
+{
+    XPRINT(1, stderr, "Memory Information:\n");
+    MTYPE myBlockSize, myBufSize;
+    int myBlockNum;
+    /* every round queries the pool of device -1, i.e., the first CPU pool */
+    for(int i = 0; i < nCPUMem; i++){
+        GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
+        XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", myBlockSize, myBlockNum, myBufSize);
+    }
+
+    for(int i = 0; i < nGPUMem; i++){
+        GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
+        XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, myBlockSize, myBlockNum, myBufSize);
+    }
+}
+
 } /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/tensor/XMem.h
+++ b/source/tensor/XMem.h
@@ -24,6 +24,7 @@
 #ifndef __XMEM_H__
 #define __XMEM_H__

+#include <stdio.h>
 #include <stdlib.h>

 #ifdef CUDA_BLAS
@@ -38,6 +39,15 @@
 #include <curand.h>
 #endif

+/* system headers that are used to query the size of the main memory */
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined(_WIN32) /* predefined by the compiler, unlike WIN32 */
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts{

@@ -52,6 +62,8 @@ typedef long long          INT_64;
 #define BUF_PITCH 256
 #define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
+#define MAX_CPU_NUM 16
+#define MAX_GPU_NUM 16

 /* 
 mode of runnig a memory pool 
@@ -402,6 +414,9 @@ public:
    /* create a new cublas handle */
    void CreateBLASHandle();

+    /* show profile of the memory pool */
+    void ShowMemUsage(FILE * file);
+
 #ifdef USE_CUDA
    /* get the handle of cublas */
    cublasHandle_t * GetCublasHandle();
@@ -409,6 +424,61 @@ public:

 };

+/*
+a class for the management of memory: it keeps one array of memory pools
+for CPUs and one for GPUs, and gives access to them by device id
+*/
+class XMemManager
+{
+public:
+    /* cpu memory pools */
+    XMem CPUMems[MAX_CPU_NUM];
+
+    /* number of cpu memory pools in use */
+    int nCPUMem;
+
+    /* gpu memory pools (indexed by GPU device id) */
+    XMem GPUMems[MAX_GPU_NUM];
+
+    /* number of gpu memory pools in use */
+    int nGPUMem;
+
+public:
+    /* constructor */
+    XMemManager();
+
+    /* de-constructor */
+    ~XMemManager();
+
+    /* get the size of the available main memory */
+    MTYPE GetAvailableMemory();
+
+    /* get the size of the free memory of a GPU */
+    MTYPE GetAvailableGPUMemory(int devID);
+
+    /* choose the buffer size for a given size of free memory */
+    void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
+
+    /* initialize it and set the global memory information */
+    void Initialize();
+
+    /* free it */
+    void Free();
+
+    /* get the global memory pool of a device (devID < 0: CPU) */
+    XMem * GetMem(const int devID);
+
+    /* get block size, block number and buffer size of the global memory pool of a device */
+    int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
+
+    /* show memory information */
+    void ShowMemInfo();
+};
+
+/* managing the memories */
+extern XMemManager GMems;
+
+
+
 extern XMem * GMem;

 extern int testxmemid;

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -67,6 +67,8 @@ const char * GetOPName(int type)
            return "M_MULTIPLY";
        else if (type == MATH_MULTIPLYDIM)
            return "M_MULTIPLYDIM";
+        else if (type == MATH_MULTIPLYBROADCAST)
+            return "M_MULTIPLYBROADCAST";
        else if (type == MATH_NEGATE)
            return "M_NEGATE";
        else if (type == MATH_NORMALIZE)
@@ -75,6 +77,14 @@ const char * GetOPName(int type)
            return "M_POWER";
        else if (type == MATH_SCALEANDSHIFT)
            return "M_SCALEANDSHIFT";
+        else if (type == MATH_SCALE)
+            return "M_SCALE";
+        else if (type == MATH_DESCALE)
+            return "M_DESCALE";
+        else if (type == MATH_SHIFT)
+            return "M_SHIFT";
+        else if (type == MATH_MULANDSHIFT)
+            return "M_OPERATION";
        else if (type == MATH_SIGN)
            return "M_SIGN";
        else if (type == MATH_SUB)
@@ -85,6 +95,8 @@ const char * GetOPName(int type)
            return "M_SUM";
        else if (type == MATH_SUMDIM)
            return "M_SUMDIM";
+        else if (type == MATH_SUMBROADCAST)
+            return "M_SUMBROADCAST";
        else if (type == REDUCE_REDUCEMAX)
            return "R_REDUCEMAX";
        else if (type == REDUCE_REDUCEMEAN)
@@ -97,13 +109,7 @@ const char * GetOPName(int type)
            return "R_REDUCEVARIANCE";
    }
    else if ((type & DATA_BASE) != 0){
-        if (type == GETANDSET_CONVERTDATATYPE)
-            return "G_CONVERTDATATYPE";
-        else if (type == GETANDSET_INDEXTOONEHOT)
-            return "G_INDEXTOONEHOT";
-        else if (type == GETANDSET_ONEHOTTOINDEX)
-            return "G_ONEHOTTOINDEX";
-        else if (type == GETANDSET_SELECT)
+        if (type == GETANDSET_SELECT)
            return "G_SELECT";
        else if (type == MOVEMENT_COPYINDEXED)
            return "M_COPYINDEXED";
@@ -111,6 +117,8 @@ const char * GetOPName(int type)
            return "M_COPYVALUES";
        else if (type == MOVEMENT_GATHER)
            return "M_GATHER";
+        else if (type == MOVEMENT_DROPOUTWITHINDEX)
+            return "M_DROPOUTWITHINDEX";
        else if (type == SHAPE_CONCATENATE)
            return "S_CONCATENATE";
        else if (type == SHAPE_MERGE)
@@ -152,6 +160,10 @@ const char * GetOPName(int type)
        else if (type == FUNC_SOFTMAX)
            return "F_SOFTMAX";
    }
+    else if ((type & LOSS_BASE) != 0) {
+        if (type == LOSS_CROSSENTROPY)
+            return "L_CROSSENTROPY";
+    }
    
    return "NULL";
 }

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -52,17 +52,24 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_MATRIXMULBATCHED   MATH_MATRIXMUL + 1
 #define MATH_MULTIPLY           MATH_MATRIXMULBATCHED + 1
 #define MATH_MULTIPLYDIM        MATH_MULTIPLY + 1
-#define MATH_NEGATE             MATH_MULTIPLYDIM + 1
+#define MATH_MULTIPLYBROADCAST  MATH_MULTIPLYDIM + 1
+#define MATH_NEGATE             MATH_MULTIPLYBROADCAST + 1
 #define MATH_NORMALIZE          MATH_NEGATE + 1
 #define MATH_POWER              MATH_NORMALIZE + 1
 #define MATH_SCALEANDSHIFT      MATH_POWER + 1
-#define MATH_SIGN               MATH_SCALEANDSHIFT + 1
+#define MATH_MULANDSHIFT        MATH_SCALEANDSHIFT + 1
+#define MATH_SCALE              MATH_MULANDSHIFT + 1
+#define MATH_DESCALE            MATH_SCALE + 1
+#define MATH_SHIFT              MATH_DESCALE + 1
+#define MATH_MOD                MATH_SHIFT + 1
+#define MATH_SIGN               MATH_MOD + 1
 #define MATH_SUB                MATH_SIGN + 1
 #define MATH_SUBDIM             MATH_SUB + 1
 #define MATH_SUM                MATH_SUBDIM + 1
 #define MATH_SUMDIM             MATH_SUM + 1
+#define MATH_SUMBROADCAST       MATH_SUMDIM + 1

-#define REDUCE                  MATH_SUMDIM + 1
+#define REDUCE                  MATH_SUMBROADCAST + 1
 #define REDUCE_REDUCEMAX        REDUCE + 1
 #define REDUCE_REDUCEMEAN       REDUCE_REDUCEMAX + 1
 #define REDUCE_REDUCESUM        REDUCE_REDUCEMEAN + 1
@@ -73,16 +80,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define DATA_BASE               MATH_BASE * 2
 #define GETANDSET               DATA_BASE + 1
 #define GETANDSET_CONVERTDATATYPE GETANDSET + 1
-#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
-#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
-#define GETANDSET_SELECT        GETANDSET_ONEHOTTOINDEX + 1
+#define GETANDSET_SELECT        GETANDSET_CONVERTDATATYPE + 1

 #define MOVEMENT                GETANDSET_SELECT + 1
 #define MOVEMENT_COPYINDEXED    MOVEMENT + 1
 #define MOVEMENT_COPYVALUES     MOVEMENT_COPYINDEXED + 1
 #define MOVEMENT_GATHER         MOVEMENT_COPYVALUES + 1
+#define MOVEMENT_DROPOUTWITHINDEX         MOVEMENT_GATHER + 1

-#define SHAPE                   MOVEMENT_GATHER + 1
+#define SHAPE                   MOVEMENT_DROPOUTWITHINDEX + 1
 #define SHAPE_CONCATENATE       SHAPE + 1
 #define SHAPE_MERGE             SHAPE_CONCATENATE + 1
 #define SHAPE_MERGE_LIST        SHAPE_MERGE + 1
@@ -108,6 +114,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define FUNC_SIGMOID            FUNC_RECTIFY + 1
 #define FUNC_SOFTMAX            FUNC_SIGMOID + 1

+#define LOSS_BASE               FUNCTION_BASE * 2
+#define LOSS_CROSSENTROPY       LOSS_BASE + 1
+
 /* get operator name */
 const char * GetOPName(int type);


--- a/source/tensor/XPRunner.cpp
+++ b/source/tensor/XPRunner.cpp
@@ -146,7 +146,7 @@ run a set of jobs in parallel
 >> jobArgs - the list of arguments for each job
 >> sleepTime - time to sleep (in ms) for each round
 */
-void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
+void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
 {
    if(threadNum <= 0){
        XPRINT(1, stderr, "Error! No threads were created!\n");
@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
            TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);

            /* the arguments that are passed to the function */
-            volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
+            volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);

            /* thread */
            XThread * thread  = threads + availableThreads[i];

--- a/source/tensor/XPRunner.h
+++ b/source/tensor/XPRunner.h
@@ -106,7 +106,7 @@ public:
    void KillThreads();

    /* run a set of jobs in parallel */
-    void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
+    void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);

    /* get the number of parallel jobs to run */
    int GetJobNum(int size);

--- a/source/tensor/XQueue.cpp
+++ b/source/tensor/XQueue.cpp
@@ -42,7 +42,7 @@ job item used in queues
 JobQueueNode::JobQueueNode()
 {
    job  = NULL;
-    args = new XList(1);
+    args = new TensorList(1);
 }

 /* de-constructor */
@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
    head = 0;
    tail = 0;
    isJobQueue = false;
-    jobDequeuerArgs = new XList(1);
+    jobDequeuerArgs = new TensorList(1);
    jobDequeuerBreak = false;
    runningJobCount = 0;
    jobStream = NULL;
@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)

    isJobQueue = true;
    jobDequeuerArgs->Clear();
-    jobDequeuerArgs->Add(this);
-    jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
+
+	// warning: this may cause unknown error
+    jobDequeuerArgs->Add((XTensor*)this);
+    jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);

    jobDequeuer.function = (TFunction)DequeueJobs;
    jobDequeuer.argv = jobDequeuerArgs;
@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
 }

 /* add a job item to process */
-void XQueue::EnqueueJob(void * job, XList * jobArgs)
+void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
 {
    MUTEX_LOCK(jobQueueMutex);
    runningJobCount++;
@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
 }

 /* job item consumer */
-void XQueue::DequeueJobs(XList * args)
+void XQueue::DequeueJobs(TensorList * args)
 {
    CheckNTErrors((args->count == 2), "Illegal arguments!");


--- a/source/tensor/XQueue.h
+++ b/source/tensor/XQueue.h
@@ -52,7 +52,7 @@ public:
    void * job;

    /* arguments of the job */
-    XList * args;
+    TensorList * args;

 public:
    /* constructor */
@@ -102,7 +102,7 @@ private:
    XThread jobDequeuer;

    /* argument list of jobDequeuer */
-    XList * jobDequeuerArgs;
+    TensorList * jobDequeuerArgs;

    /* indicates whether jobDequeuer stops */
    bool jobDequeuerBreak;
@@ -141,11 +141,11 @@ public:
    void StopJobConsumer();

    /* add a job item to process */
-    void EnqueueJob(void * job, XList * jobArgs);
+    void EnqueueJob(void * job, TensorList * jobArgs);

    /* job item consumer */
    static
-    void DequeueJobs(XList * args);
+    void DequeueJobs(TensorList * args);

    /* get the break flag */
    bool GetJobBreak();

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
--- a/source/tensor/XThread.h
+++ b/source/tensor/XThread.h
@@ -85,7 +85,7 @@ namespace nts{

 #endif

-typedef void (*TFunction) (volatile XList*);
+typedef void (*TFunction) (volatile TensorList*);

 /*
 This is a class that wraps the standard implementation of threading
@@ -133,7 +133,7 @@ public:

    /* arguments (for the function to run) */
    volatile
-    XList * argv;
+    TensorList * argv;

    /* a flag to break */
    volatile

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -28,6 +28,7 @@

 #include "arithmetic/Div.h"
 #include "arithmetic/DivDim.h"
+#include "arithmetic/Mask.h"
 #include "arithmetic/MatrixMul.h"
 #include "arithmetic/MatrixMul2D.h"
 #include "arithmetic/MatrixMul2DMultiTheading.h"
@@ -44,12 +45,14 @@
 #include "arithmetic/SumByColumnVT.h"
 #include "arithmetic/SumDim.h"
 #include "arithmetic/XTensorBLAS.h"
+#include "arithmetic/MulAndShift.h"

 #include "getandset/ConvertDataType.h"
 #include "getandset/OnehotAndIndex.h"
 #include "getandset/Select.h"
 #include "getandset/SetData.h"

+#include "math/Binary.h"
 #include "math/Clip.h"
 #include "math/Compare.h"
 #include "math/Normalize.h"

--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
@@ -21,6 +21,7 @@

 #include "../../XTensor.h"
 #include "../../XName.h"
+#include "../../XUtility.h"
 #include "Div.h"
 #include "Div.cuh"
 #include "DivDim.h"
@@ -41,12 +42,15 @@ where i is the index of the item
 */
 void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-	int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order), 
                  "Unmatched tensors!");

+    CheckDev(a->devID, b->devID);
+
+    int leadingDimRDI = a->order - leadingDim - 1;
+
 #ifdef USE_CUDA
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
        _CudaDiv(a, b, c, alpha, leadingDim);
@@ -214,4 +218,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
    return c;
 }

+/*
+element-wise division of two tensors (the result is written into a given tensor)
+
+c(i) = a(i)/b(i) + \alpha * c(i)
+where i is the index of the item
+
+GetDivDimIndex(a, b) selects the implementation: -1 runs _Div on the whole
+tensors, an index n in [0, a.order) runs _DivDim along dimension n, and any
+other value is reported as an error.
+
+>> a - tensor a
+>> b - tensor b
+>> c - result tensor. It is (re)initialized from a when it is not initialized
+       or is not shaped like a. NOTE: with a non-zero alpha the current content
+       of c takes part in the result, so c should hold meaningful data then.
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+>> requireLink - whether the operation is recorded in the network (XLink)
+*/
+void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
+{
+    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
+        InitTensor(&c, &a);
+    }
+
+    int n = GetDivDimIndex(a, b);
+
+    if (n == -1) {
+        /* leadingDim is used as an index into dimSize below, so check it first */
+        CheckNTErrors(leadingDim >= 0 && leadingDim < a.order, "Illegal leading dimension!");
+        CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
+
+        /* call _Div function. alpha is forwarded (it used to be a literal 0) so
+           that the computed value agrees with the documented formula and with
+           the alpha that is recorded on the link below */
+        _Div(&a, &b, &c, alpha, leadingDim);
+
+        if (requireLink) {
+            /* tensor connections */
+            XLink::MakeLink(&a, &b, &c, MATH_DIV);
+            XLink::AddParamToHead(&c, alpha);
+            XLink::AddParamToHeadInt(&c, leadingDim);
+        }
+    }
+    else if (n >= 0 && n < a.order) {
+        /* call _DivDim function */
+        _DivDim(&a, &b, &c, n, alpha);
+
+        if (requireLink) {
+            /* tensor connections */
+            XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
+            XLink::AddParamToHeadInt(&c, n);
+            XLink::AddParamToHead(&c, alpha);
+        }
+    }
+    else {
+        ShowNTErrors("Something is wrong!");
+    }
+}
+
 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Div.h
+++ b/source/tensor/core/arithmetic/Div.h
@@ -49,6 +49,13 @@ where i is the index of the element
 */
 XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);

+/*
+element-wise division of two tensors (the result is written into c):
+c(i) = a(i)/b(i) + \alpha * c(i)
+where i is the index of the element
+
+>> a - tensor a
+>> b - tensor b
+>> c - the tensor that receives the result
+>> alpha - the coefficient of the c(i) term
+>> leadingDim - the dimension along which we perform broadcasting
+>> requireLink - whether the operation is recorded in the network
+*/
+void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __DIV_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/DivDim.cpp
+++ b/source/tensor/core/arithmetic/DivDim.cpp
@@ -19,10 +19,12 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
 */

+#include <math.h>
 #include "Div.h"
 #include "DivDim.h"
 #include "DivDim.cuh"
 #include "../../XName.h"
+#include "../../XUtility.h"
 #include "../movement/CopyValues.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -42,6 +44,8 @@ i.e., a is divided with b by broadcasting
 */
 void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
 {
+    n = MODX(n, a->order);
+
    CheckNTErrors(a && b && c, "Empty tensor input!");
    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
@@ -50,6 +54,8 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");

+    CheckDev(a->devID, b->devID);
+
    if(XTensor::IsSameShaped(a, b)){
        _Div(a, b, c, alpha);
        return;
@@ -151,6 +157,8 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
 {
    XTensor c(&a);
    c.SetTMPFlag();
+
+    n = MODX(n, a.order);
    
    /* call _Div function */
    _DivDim(&a, &b, &c, n, alpha);
@@ -162,5 +170,36 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
    
    return c;
 }
+
+/*
+tensor division (the result is written into a given tensor)
+
+c = a / b + \alpha * c
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is divided with b by broadcasting 
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> c - where we put the result. It is (re)initialized from a when it is not
+       initialized or is not shaped like a.
+>> n - the dimension index (normalized with MODX(n, a.order) before use)
+>> alpha - the scaling factor
+>> requireLink - whether the operation is recorded in the network (XLink)
+*/
+void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
+{
+    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
+        InitTensor(&c, &a);
+    }
+
+    /* normalize the dimension index here, as the XTensor-returning DivDim does,
+       so that the value stored on the link is the one _DivDim actually uses */
+    n = MODX(n, a.order);
+
+    /* call _DivDim function */
+    _DivDim(&a, &b, &c, n, alpha);
+
+    if (requireLink) {
+        /* tensor connections */
+        XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
+        XLink::AddParamToHeadInt(&c, n);
+        XLink::AddParamToHead(&c, alpha);
+    }
+}
    
 }
--- a/source/tensor/core/arithmetic/DivDim.h
+++ b/source/tensor/core/arithmetic/DivDim.h
@@ -52,6 +52,14 @@ i.e., a is divided with b by broadcasting
 we make a new tensor c to keep the result and return it
 */
 XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
+
+/* 
+tensor division of two tensors (the result is written into c):
+c = a/b + \alpha * c
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is divided with b by broadcasting 
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> c - the tensor that receives the result
+>> n - the dimension index
+>> alpha - the scaling factor
+>> requireLink - whether the operation is recorded in the network
+*/
+void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
    
 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/Mask.cpp
+++ b/source/tensor/core/arithmetic/Mask.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
+* I'll attend several conferences and workshops in the following weeks -
+* busy days :(
+*/
+
+#include "../../XTensor.h"
+#include "../../XName.h"
+#include "../../XUtility.h"
+#include "Mask.h"
+#include "Mask.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+mask entries of a given tensor:
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+where i is the index of the element
+
+>> a - the input tensor
+>> mask - the mask tensor (must be of type X_INT and have as many units as a)
+>> c - where we put the masked tensor (may be the same tensor as a)
+>> alpha - the value written where the mask is zero
+*/
+void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
+{
+    CheckNTErrors(a && mask && c, "Empty tensor input!");
+    CheckNTErrors(a->unitNum == mask->unitNum && a->unitNum == c->unitNum,
+        "Unmatched tensors in masking!");
+    CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
+
+    /* the data types of a and c are not compared with that of the mask:
+       they are checked against DEFAULT_DTYPE where the data is accessed */
+
+    if (a->devID >= 0 || mask->devID >= 0 || c->devID >= 0) {
+#ifdef USE_CUDA
+        if (a == c) {
+            int P2PAccesible = 0;
+#ifdef CUDA_UVA
+            /* the peer of a is the mask tensor (there is no tensor "b" here) */
+            cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, mask->devID);
+#endif
+            if ((a->devID < 0 && mask->devID >= 0) ||
+                (a->devID >= 0 && mask->devID < 0) ||
+                (a->devID >= 0 && mask->devID >= 0 && a->devID != mask->devID && !P2PAccesible))
+            {
+                ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
+            }
+            else
+                _CudaMask(a, mask, c, alpha);
+        }
+        else
+            _CudaMask(a, mask, c, alpha);
+#else
+        /* a GPU tensor cannot be processed by a CPU-only build; report it
+           instead of returning silently with c untouched */
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }
+    else {
+        if (!a->isSparse && !mask->isSparse) {
+            CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in masking!");
+
+            if (a->dataType == DEFAULT_DTYPE &&
+                mask->dataType == X_INT &&
+                c->dataType == DEFAULT_DTYPE)
+            {
+                DTYPE * ap = (DTYPE*)a->data;
+                int * maskp = (int*)mask->data;
+                DTYPE * cp = (DTYPE*)c->data;
+
+                /* one pass over all units; reading ap[i] before writing cp[i]
+                   keeps the in-place case (a == c) correct */
+                int num = a->unitNum;
+                for (int i = 0; i < num; i++)
+                    cp[i] = (maskp[i] == 0) ? alpha : ap[i];
+            }
+            else {
+                // TODO!!
+                ShowNTErrors("TODO!");
+            }
+        }
+        else {
+            // TODO!!
+            ShowNTErrors("TODO!");
+        }
+    }
+}
+
+/*
+mask entries of a given tensor (on site):
+a(i) = a(i) if mask(i) is non-zero
+a(i) = alpha if mask(i) = 0
+where i is the index of the element
+
+>> a - the tensor that is masked in place (it is both the input and the output)
+>> mask - the mask tensor
+>> alpha - the value written where the mask is zero
+*/
+void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
+{
+    /* delegate to _Mask with the output aliased to the input */
+    _Mask(a, mask, a, alpha);
+}
+
+/*
+mask entries of a given tensor (return an XTensor structure):
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+where i is the index of the element and c is the returned tensor
+
+NOTE(review): this function is unfinished. No link to the network is made and
+ShowNTErrors("TODO!") is called unconditionally after the result is computed.
+
+>> a - the input tensor
+>> mask - the mask tensor
+>> alpha - the value written where the mask is zero
+<< return - the masked tensor (a temporary tensor shaped like a)
+*/
+XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
+{
+    XTensor c(&a);
+    c.SetTMPFlag();
+
+    /* call _Mask function */
+    _Mask(&a, &mask, &c, alpha);
+
+    /* tensor connections: not implemented yet. The disabled calls below use
+       MATH_SUM, which is presumably a placeholder and not an operator id for
+       masking - TODO confirm before enabling them */
+    //XLink::MakeLink(&a, &mask, &c, MATH_SUM);
+    //XLink::AddParamToHead(&c, alpha);
+    // TODO!!
+    ShowNTErrors("TODO!");
+
+    return c;
+}
+
+}
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Mask.cu
+++ b/source/tensor/core/arithmetic/Mask.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
+* I'll attend several conferences and workshops in the following weeks -
+* busy days :(
+*/
+
+#include "../../XDevice.h"
+#include "../../XUtility.h"
+#include "Sub.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/*
+mask entries of a given tensor (CUDA Kernel)
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+>> a - the input data array
+>> mask - the mask data array
+>> c - where we put masked a
+>> size - the number of items in a/mask/c
+>> alpha - the value written where the mask is zero
+*/
+__global__
+    void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
+{
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* threads that fall beyond the end of the data have nothing to do */
+    if (idx >= size)
+        return;
+
+    c[idx] = (mask[idx] == 0) ? alpha : a[idx];
+}
+
+/*
+mask entries of a given tensor (cuda version)
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+>> a - a tensor
+>> mask - mask tensor (must be of type X_INT)
+>> c - where we put masked a
+>> alpha - the value written where the mask is zero
+
+All three tensors must be on the same device. The current GPU device is
+switched to that of a for the launch and restored afterwards.
+*/
+void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
+{
+    CheckNTErrors(a && mask && c, "Empty tensor input!");
+    CheckNTErrors((a->unitNum == mask->unitNum && a->unitNum == c->unitNum),
+        "Unmatched tensors in masking!");
+    CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
+    CheckNTErrors((a->devID == mask->devID && a->devID == c->devID),
+        "The tensors must be on the same device!");
+
+    /* remember the active device so that it can be restored on exit */
+    int devIDBackup = XDevice::GetGPUDevice();
+    XDevice::SetGPUDevice(a->devID);
+
+    if (!a->isSparse && !mask->isSparse) {
+        CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in masking!");
+
+        if (a->dataType == DEFAULT_DTYPE &&
+            mask->dataType == X_INT &&
+            c->dataType == DEFAULT_DTYPE)
+        {
+            int gridSize[3], blockSize[3];
+
+            /* one thread per unit; only the first entry of each size array is used */
+            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+            dim3 blocks(gridSize[0]);
+            dim3 threads(blockSize[0]);
+            KernelMASK<<<blocks, threads>>>((DTYPE*)a->data, (int *)mask->data, (DTYPE*)c->data, a->unitNum, alpha);
+        }
+        else {
+            // TODO!!
+            ShowNTErrors("TODO!");
+        }
+    }
+    else {
+        // TODO!!
+        ShowNTErrors("TODO!");
+    }
+
+    XDevice::SetGPUDevice(devIDBackup);
+}
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Mask.cuh
+++ b/source/tensor/core/arithmetic/Mask.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
+* I'll attend several conferences and workshops in the following weeks -
+* busy days :(
+*/
+
+#ifndef __MASK_CUH__
+#define __MASK_CUH__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/*
+mask entries of a given tensor (cuda version)
+>> a - a tensor
+>> mask - mask tensor
+>> c - where we put masked a. NOTE(review): the default is NULL, but the
+       definition in Mask.cu checks that c is non-NULL, so a result tensor
+       has to be passed explicitly.
+>> alpha - the value written where the mask is zero
+*/
+void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c = NULL, DTYPE alpha = (DTYPE)1.0);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __MASK_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Mask.h
+++ b/source/tensor/core/arithmetic/Mask.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
+ * I'll attend several conferences and workshops in the following weeks -
+ * busy days :(
+ */
+
+#ifndef __MASK_H__
+#define __MASK_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* 
+mask entries of a given tensor:
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+where i is the index of the element
+
+>> a - the input tensor
+>> mask - the mask tensor
+>> c - where we put the masked tensor
+>> alpha - the value written where the mask is zero
+*/
+void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
+
+/* 
+mask entries of a given tensor (on site):
+a(i) = a(i) if mask(i) is non-zero
+a(i) = alpha if mask(i) = 0
+where i is the index of the element
+
+>> a - the tensor that is masked in place
+>> mask - the mask tensor
+>> alpha - the value written where the mask is zero
+*/
+void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
+
+/* 
+mask entries of a given tensor (return an XTensor structure):
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+where i is the index of the element and c is the returned tensor
+
+>> a - the input tensor
+>> mask - the mask tensor
+>> alpha - the value written where the mask is zero (0.0 by default)
+*/
+XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __MASK_H__
--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
--- a/source/tensor/core/arithmetic/MatrixMul.h
+++ b/source/tensor/core/arithmetic/MatrixMul.h
--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.h
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.h
--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
--- a/source/tensor/core/arithmetic/MatrixMulBatched.h
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.h
--- a/source/tensor/core/arithmetic/MulAndShift.cpp
+++ b/source/tensor/core/arithmetic/MulAndShift.cpp
--- a/source/tensor/core/arithmetic/MulAndShift.h
+++ b/source/tensor/core/arithmetic/MulAndShift.h
--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
--- a/source/tensor/core/arithmetic/MultiplyDim.cpp
+++ b/source/tensor/core/arithmetic/MultiplyDim.cpp
--- a/source/tensor/core/arithmetic/MultiplyDim.h
+++ b/source/tensor/core/arithmetic/MultiplyDim.h
--- a/source/tensor/core/arithmetic/Negate.cpp
+++ b/source/tensor/core/arithmetic/Negate.cpp
--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
--- a/source/tensor/core/arithmetic/Sub.cpp
+++ b/source/tensor/core/arithmetic/Sub.cpp
--- a/source/tensor/core/arithmetic/Sub.h
+++ b/source/tensor/core/arithmetic/Sub.h
--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
--- a/source/tensor/core/arithmetic/SubDim.h
+++ b/source/tensor/core/arithmetic/SubDim.h
--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
--- a/source/tensor/core/arithmetic/Sum.h
+++ b/source/tensor/core/arithmetic/Sum.h
--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
--- a/source/tensor/core/arithmetic/SumDim.cuh
+++ b/source/tensor/core/arithmetic/SumDim.cuh
--- a/source/tensor/core/arithmetic/SumDim.h
+++ b/source/tensor/core/arithmetic/SumDim.h
--- a/source/tensor/core/arithmetic/XTensorBLAS.cu
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cu
--- a/source/tensor/core/arithmetic/XTensorBLAS.h
+++ b/source/tensor/core/arithmetic/XTensorBLAS.h
--- a/source/tensor/core/getandset/ConvertDataType.cpp
+++ b/source/tensor/core/getandset/ConvertDataType.cpp
--- a/source/tensor/core/getandset/ConvertDataType.cu
+++ b/source/tensor/core/getandset/ConvertDataType.cu
--- a/source/tensor/core/getandset/ConvertDataType.cuh
+++ b/source/tensor/core/getandset/ConvertDataType.cuh
--- a/source/tensor/core/getandset/ConvertDataType.h
+++ b/source/tensor/core/getandset/ConvertDataType.h
--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
--- a/source/tensor/core/getandset/OnehotAndIndex.cu
+++ b/source/tensor/core/getandset/OnehotAndIndex.cu
--- a/source/tensor/core/getandset/OnehotAndIndex.cuh
+++ b/source/tensor/core/getandset/OnehotAndIndex.cuh
--- a/source/tensor/core/getandset/OnehotAndIndex.h
+++ b/source/tensor/core/getandset/OnehotAndIndex.h
--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
--- a/source/tensor/core/math/Binary.cu
+++ b/source/tensor/core/math/Binary.cu
--- a/source/tensor/core/math/Binary.cuh
+++ b/source/tensor/core/math/Binary.cuh
--- a/source/tensor/core/math/Binary.h
+++ b/source/tensor/core/math/Binary.h
--- a/source/tensor/core/math/Clip.cpp
+++ b/source/tensor/core/math/Clip.cpp
--- a/source/tensor/core/math/Clip.h
+++ b/source/tensor/core/math/Clip.h
--- a/source/tensor/core/math/Compare.cuh
+++ b/source/tensor/core/math/Compare.cuh
--- a/source/tensor/core/math/Compare.h
+++ b/source/tensor/core/math/Compare.h
--- a/source/tensor/core/math/Normalize.cpp
+++ b/source/tensor/core/math/Normalize.cpp
--- a/source/tensor/core/math/Power.cpp
+++ b/source/tensor/core/math/Power.cpp
--- a/source/tensor/core/math/Power.h
+++ b/source/tensor/core/math/Power.h
--- a/source/tensor/core/math/ScaleAndShift.cpp
+++ b/source/tensor/core/math/ScaleAndShift.cpp
--- a/source/tensor/core/math/ScaleAndShift.h
+++ b/source/tensor/core/math/ScaleAndShift.h
--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
--- a/source/tensor/core/movement/CopyIndexed.cpp
+++ b/source/tensor/core/movement/CopyIndexed.cpp
--- a/source/tensor/core/movement/CopyIndexed.h
+++ b/source/tensor/core/movement/CopyIndexed.h
--- a/source/tensor/core/movement/CopyValues.cpp
+++ b/source/tensor/core/movement/CopyValues.cpp
--- a/source/tensor/core/movement/CopyValues.cu
+++ b/source/tensor/core/movement/CopyValues.cu
--- a/source/tensor/core/movement/CopyValues.h
+++ b/source/tensor/core/movement/CopyValues.h
--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
--- a/source/tensor/core/reduce/ReduceMax.h
+++ b/source/tensor/core/reduce/ReduceMax.h
--- a/source/tensor/core/reduce/ReduceMean.cpp
+++ b/source/tensor/core/reduce/ReduceMean.cpp
--- a/source/tensor/core/reduce/ReduceMean.h
+++ b/source/tensor/core/reduce/ReduceMean.h
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
--- a/source/tensor/core/reduce/ReduceSum.h
+++ b/source/tensor/core/reduce/ReduceSum.h
--- a/source/tensor/core/reduce/ReduceSumAll.cpp
+++ b/source/tensor/core/reduce/ReduceSumAll.cpp
--- a/source/tensor/core/reduce/ReduceSumSquared.cpp
+++ b/source/tensor/core/reduce/ReduceSumSquared.cpp
--- a/source/tensor/core/reduce/ReduceSumSquared.h
+++ b/source/tensor/core/reduce/ReduceSumSquared.h
--- a/source/tensor/core/reduce/ReduceVariance.cpp
+++ b/source/tensor/core/reduce/ReduceVariance.cpp
--- a/source/tensor/core/reduce/ReduceVariance.h
+++ b/source/tensor/core/reduce/ReduceVariance.h
--- a/source/tensor/core/shape/Concatenate.cpp
+++ b/source/tensor/core/shape/Concatenate.cpp
--- a/source/tensor/core/shape/Concatenate.h
+++ b/source/tensor/core/shape/Concatenate.h
--- a/source/tensor/core/shape/ConcatenateSolely.cpp
+++ b/source/tensor/core/shape/ConcatenateSolely.cpp
--- a/source/tensor/core/shape/ConcatenateSolely.h
+++ b/source/tensor/core/shape/ConcatenateSolely.h
--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
--- a/source/tensor/core/shape/Merge.h
+++ b/source/tensor/core/shape/Merge.h
--- a/source/tensor/core/shape/MergeBlockLists.cpp
+++ b/source/tensor/core/shape/MergeBlockLists.cpp
--- a/source/tensor/core/shape/MergeBlockLists.cu
+++ b/source/tensor/core/shape/MergeBlockLists.cu
--- a/source/tensor/core/shape/MergeBlockLists.cuh
+++ b/source/tensor/core/shape/MergeBlockLists.cuh
--- a/source/tensor/core/shape/MergeBlockLists.h
+++ b/source/tensor/core/shape/MergeBlockLists.h
--- a/source/tensor/core/shape/Reshape.cpp
+++ b/source/tensor/core/shape/Reshape.cpp
--- a/source/tensor/core/shape/Reshape.h
+++ b/source/tensor/core/shape/Reshape.h
--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
--- a/source/tensor/core/shape/Squeeze.cpp
+++ b/source/tensor/core/shape/Squeeze.cpp
--- a/source/tensor/core/shape/Squeeze.h
+++ b/source/tensor/core/shape/Squeeze.h
--- a/source/tensor/core/shape/Unsqueeze.cpp
+++ b/source/tensor/core/shape/Unsqueeze.cpp
--- a/source/tensor/core/shape/Unsqueeze.h
+++ b/source/tensor/core/shape/Unsqueeze.h
--- a/source/tensor/core/sort/Sort.cpp
+++ b/source/tensor/core/sort/Sort.cpp
--- a/source/tensor/core/sort/TopK.cpp
+++ b/source/tensor/core/sort/TopK.cpp
--- a/source/tensor/core/utilities/FlushToMem.cpp
+++ b/source/tensor/core/utilities/FlushToMem.cpp
--- a/source/tensor/core/utilities/FlushToMem.cu
+++ b/source/tensor/core/utilities/FlushToMem.cu
--- a/source/tensor/core/utilities/FlushToMem.cuh
+++ b/source/tensor/core/utilities/FlushToMem.cuh
--- a/source/tensor/core/utilities/FlushToMem.h
+++ b/source/tensor/core/utilities/FlushToMem.h
--- a/source/tensor/core/utilities/XMatrixSegment.cpp
+++ b/source/tensor/core/utilities/XMatrixSegment.cpp
--- a/source/tensor/function/Dropout.cpp
+++ b/source/tensor/function/Dropout.cpp
--- a/source/tensor/function/Dropout.h
+++ b/source/tensor/function/Dropout.h
--- a/source/tensor/function/DropoutWithIndex.cpp
+++ b/source/tensor/function/DropoutWithIndex.cpp
--- a/source/tensor/function/DropoutWithIndex.cu
+++ b/source/tensor/function/DropoutWithIndex.cu
--- a/source/tensor/function/DropoutWithIndex.cuh
+++ b/source/tensor/function/DropoutWithIndex.cuh
--- a/source/tensor/function/DropoutWithIndex.h
+++ b/source/tensor/function/DropoutWithIndex.h
--- a/source/tensor/function/FHeader.h
+++ b/source/tensor/function/FHeader.h
--- a/source/tensor/function/HardTanH.cpp
+++ b/source/tensor/function/HardTanH.cpp
--- a/source/tensor/function/HardTanH.cu
+++ b/source/tensor/function/HardTanH.cu
--- a/source/tensor/function/HardTanH.cuh
+++ b/source/tensor/function/HardTanH.cuh
--- a/source/tensor/function/HardTanH.h
+++ b/source/tensor/function/HardTanH.h
--- a/source/tensor/function/Identity.cpp
+++ b/source/tensor/function/Identity.cpp
--- a/source/tensor/function/Identity.h
+++ b/source/tensor/function/Identity.h
--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
--- a/source/tensor/function/LogSoftmax.cuh
+++ b/source/tensor/function/LogSoftmax.cuh
--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
--- a/source/tensor/function/Rectify.cpp
+++ b/source/tensor/function/Rectify.cpp
--- a/source/tensor/function/Rectify.cu
+++ b/source/tensor/function/Rectify.cu
--- a/source/tensor/function/Rectify.cuh
+++ b/source/tensor/function/Rectify.cuh
--- a/source/tensor/function/Rectify.h
+++ b/source/tensor/function/Rectify.h
--- a/source/tensor/function/Sigmoid.cpp
+++ b/source/tensor/function/Sigmoid.cpp
--- a/source/tensor/function/Sigmoid.cu
+++ b/source/tensor/function/Sigmoid.cu
--- a/source/tensor/function/Sigmoid.cuh
+++ b/source/tensor/function/Sigmoid.cuh
--- a/source/tensor/function/Sigmoid.h
+++ b/source/tensor/function/Sigmoid.h
--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
--- a/source/tensor/function/Softmax.h
+++ b/source/tensor/function/Softmax.h
--- a/source/tensor/function/SoftmaxWithCrossEntropy.cpp
+++ b/source/tensor/function/SoftmaxWithCrossEntropy.cpp
--- a/source/tensor/function/SoftmaxWithCrossEntropy.cu
+++ b/source/tensor/function/SoftmaxWithCrossEntropy.cu
--- a/source/tensor/function/SoftmaxWithCrossEntropy.cuh
+++ b/source/tensor/function/SoftmaxWithCrossEntropy.cuh
--- a/source/tensor/function/SoftmaxWithCrossEntropy.h
+++ b/source/tensor/function/SoftmaxWithCrossEntropy.h
--- a/source/tensor/loss/CrossEntropy.cpp
+++ b/source/tensor/loss/CrossEntropy.cpp
--- a/source/tensor/loss/CrossEntropy.cu
+++ b/source/tensor/loss/CrossEntropy.cu
--- a/source/tensor/loss/CrossEntropy.cuh
+++ b/source/tensor/loss/CrossEntropy.cuh
--- a/source/tensor/loss/CrossEntropy.h
+++ b/source/tensor/loss/CrossEntropy.h
--- a/source/tensor/loss/LHeader.h
+++ b/source/tensor/loss/LHeader.h
--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
--- a/source/tensor/test/TConcatenateSolely.cpp
+++ b/source/tensor/test/TConcatenateSolely.cpp
--- a/source/tensor/test/TCrossEntropy.cpp
+++ b/source/tensor/test/TCrossEntropy.cpp
--- a/source/tensor/test/TCrossEntropy.h
+++ b/source/tensor/test/TCrossEntropy.h
--- a/source/tensor/test/TGather.cpp
+++ b/source/tensor/test/TGather.cpp
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/tensor/test/TMerge.cpp
+++ b/source/tensor/test/TMerge.cpp
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
--- a/source/tensor/test/TSpread.cpp
+++ b/source/tensor/test/TSpread.cpp
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h