Commit 135aadf4 by liyinqiao

Merge with Xuchen branch

parent c7559b7d
# the prefix of the generated executable file
PREFIX = NiuTrans
NIUTRANS_EXE := $(PREFIX).Tensor
# code path and generated file path
ROOT = .
SRC = $(ROOT)/source
LIB_DIR = $(ROOT)/lib
EXE_DIR = $(ROOT)/bin
# whether to generate dll
dll = 0
# 0 - use CPU
# 1 - use GPU
USE_CUDA = 1
# modify this path if necessary
CUDA_ROOT = /usr/local/cuda-9.0
CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE = $(MKL_ROOT)/include
# use OpenBLAS
USE_OPENBLAS = 0
OPENBLAS_ROOT = /opt/OpenBLAS
OPENBLAS_LIB_DIR = $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE = $(OPENBLAS_ROOT)/include
SRC_DIR = $(shell find $(SRC) -type d)
# header include directories
# external dependency library directories
INC_DIR = $(SRC_DIR)
DEPLIB_DIR =
ifeq ($(USE_CUDA), 1)
INC_DIR += $(CUDA_INCLUDE)
DEPLIB_DIR += $(CUDA_LIB_DIR)
endif
ifeq ($(USE_MKL), 1)
INC_DIR += $(MKL_INCLUDE)
DEPLIB_DIR += $(MKL_LIB_DIR)
endif
ifeq ($(USE_OPENBLAS), 1)
INC_DIR += $(OPENBLAS_INCLUDE)
DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
endif
# macro
MACRO =
ifeq ($(USE_CUDA), 1)
MACRO += -DUSE_CUDA
endif
ifeq ($(USE_MKL), 1)
MACRO += -DUSE_BLAS -DMKL
endif
ifeq ($(USE_OPENBLAS), 1)
MACRO += -DUSE_BLAS -DOPENBLAS
endif
# dependency
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread
ifeq ($(USE_MKL), 1)
STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
$(MKL_LIB_DIR)/libmkl_core.a \
$(MKL_LIB_DIR)/libmkl_intel_thread.a \
$(INTEL_ROOT)/lib/intel64/libiomp5.a
DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif
ifeq ($(USE_OPENBLAS), 1)
STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
DYNAMIC_DEPLIB += -lopenblas
endif
ifeq ($(USE_CUDA), 1)
STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
$(CUDA_LIB_DIR)/libculibos.a \
$(CUDA_LIB_DIR)/libnpps_static.a \
$(CUDA_LIB_DIR)/libnppc_static.a \
$(CUDA_LIB_DIR)/libcudadevrt.a \
$(CUDA_LIB_DIR)/libcurand_static.a \
/lib64/libdl.so.2
DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif
DEPLIBS = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group -lm -ldl $(DYNAMIC_DEPLIB)
# specify the compilers here
CC = gcc
CXX = g++
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
CC = icc
CXX = icc
endif
# main file
MAIN_FILE = $(SRC)/network/Main.cpp
Tensor_Main := $(SRC)/tensor/Main.cpp
Network_Main := $(SRC)/network/Main.cpp
ifeq ($(USE_CUDA), 1)
NIUTRANS_EXE := $(NIUTRANS_EXE).GPU
else
NIUTRANS_EXE := $(NIUTRANS_EXE).CPU
endif
NIUTRANS_DLL := $(LIB_DIR)/lib$(NIUTRANS_EXE).so
NIUTRANS_EXE := $(EXE_DIR)/$(NIUTRANS_EXE)
# specify the compiling arguments here
CFLAGS = -std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
# GTX 1080: arch=compute_61,code=sm_61
# K80: arch=compute_37,code=sm_37
# if the architecture is set incorrectly, the result can be `-inf`
CUDA_FLAG = -arch=sm_30 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70 \
-maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math -std=c++11
CFLAGS += -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions
# include dir
CFLAGS += -fPIC $(addprefix -I, $(INC_DIR))
# CUDA_FLAG += $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)
# lib dir
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))
# source files
ifeq ($(USE_CUDA), 1)
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) $(wildcard $(dir)/*.cu))
else
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) )
endif
SOURCES := $(subst $(Tensor_Main), ,$(SOURCES))
SOURCES := $(subst $(Network_Main), ,$(SOURCES))
# object file
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif
all: start lib exe finish
start:
@echo ""
@echo "Start building ..."
lib: start_lib niutrans_dll finish_lib
start_lib:
@mkdir -p $(LIB_DIR)
@echo ""
@echo "Start building library"
niutrans_dll: $(NIUTRANS_DLL)
$(NIUTRANS_DLL): $(OBJS)
ifeq ($(dll), 1)
@echo "Building dynamic link library: $(NIUTRANS_DLL)"
@$(CXX) -shared -Wall $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
else
@echo "Skip building dynamic link library"
endif
finish_lib:
@echo "Finish building library"
@echo ""
exe: start_exe niutrans_exe finish_exe
start_exe:
@mkdir -p $(EXE_DIR)
@echo ""
@echo "Start building executable file"
niutrans_exe: $(NIUTRANS_EXE)
$(NIUTRANS_EXE): $(OBJS) $(MAIN_FILE)
@echo "Building executable file: $(NIUTRANS_EXE)"
@$(CXX) $(MAIN_FILE) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
finish_exe:
@echo "Finish building executable file"
@echo ""
finish:
@echo "Finish building ..."
@echo ""
%.o: %.c
@$(CC) $(CFLAGS) -c $< -o $@
%.o: %.cpp
@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@
%.cuo: %.cu
ifeq ($(dll), 1)
@$(NVCC) --shared --compiler-options '-fPIC' $(CUDA_FLAG) -c $< -o $@
else
@$(NVCC) $(CUDA_FLAG) -c $< -o $@
endif
.PHONY: clean
clean:
@echo "Cleaning object files"
@-rm -f $(OBJS)
\ No newline at end of file
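The CUDA_FLAG comments above warn that a wrong -arch/-gencode setting can silently produce `-inf` results. A minimal sketch, not part of this commit (the file name check_arch.cu is hypothetical), for printing each device's compute capability so it can be checked against the -gencode list before building:

/* check_arch.cu -- build with: $(CUDA_ROOT)/bin/nvcc check_arch.cu -o check_arch */
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        /* e.g. 6.1 means the Makefile needs -gencode=arch=compute_61,code=sm_61 */
        printf("device %d: %s, compute capability %d.%d\n",
               i, prop.name, prop.major, prop.minor);
    }
    return 0;
}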
......@@ -45,7 +45,9 @@ int main( int argc, const char ** argv )
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......@@ -54,6 +56,7 @@ int main( int argc, const char ** argv )
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
//_CrtDumpMemoryLeaks();
......
......@@ -43,18 +43,18 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
......
......@@ -69,7 +69,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
......@@ -98,39 +98,39 @@ compute dE/dx for a given function y = f(x)
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(gold && y && x, "Empty input tensors!");
CheckNTErrors(dedx, "Empty gradient tensors!");
CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
}
}
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when call the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
......@@ -139,27 +139,27 @@ compute dE/dy for variable y and error(loss) function E
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
if(gold == NULL){
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
return;
}
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
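For reference, a hedged reading of the two branches above (not code from this commit): when lossName is CROSSENTROPY and assuming the standard definition over a gold distribution g and prediction y,

    E = -\sum_i g_i \log y_i
    dE/dy_i = -g_i / y_i    (what _CrossEntropyBackward is expected to produce, with padded positions masked out when a padding tensor is given)

and when gold is NULL the gradient dE/dy is simply seeded with ones, the usual starting point of reverse-mode differentiation when y itself is the final loss node.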
......@@ -43,11 +43,11 @@ public:
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
///* compute dE/dx for a given function y = f(x) */
//void Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
......
......@@ -530,7 +530,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
......
......@@ -20,7 +20,7 @@
* This is a simple implementation of the feed-forward network-based language
* model (FNNLM). See more details about FNNLM in
* "A Neural Probabilistic Language Model" by Bengio et al.
* Journal of Machine Learning Research 3 (2003) 1137C1155
* Journal of Machine Learning Research 3 (2003) 1137-1155
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
*/
......@@ -469,6 +469,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* update model parameters */
Update(model, grad, learningRate, false);
/* get probabilities */
float prob = GetProb(output, gold);
loss -= prob;
}
else{
/* gradient = 0 */
......@@ -480,23 +484,19 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
autoDiffer.Backward(lossTensor);
//autoDiffer.Backward(output, gold, CROSSENTROPY);
/* update model parameters */
Update(model, grad, learningRate, true);
/* get probabilities */
float prob = ReduceSumAll(lossTensor);
loss += prob;
}
/* get probabilities */
float prob = GetProb(output, gold);
prob = ReduceSumAll(lossTensor);
loss += prob;
wordCount += ngramNum;
wordCountTotal += ngramNum;
......@@ -579,9 +579,6 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
XTensor * para = (XTensor*)paraList.GetItem(i);
XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
//fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
/* the delta rule */
_Sum(para, paraGrad, para, -epsilon);
}
......@@ -600,14 +597,14 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs);
Multiply(output, gold, probs);
/* probability of each word */
XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1);
ReduceSum(probs, wprobs, 1);
if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
CopyValues(wprobs, *wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
......@@ -619,7 +616,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1);
ReduceSum(probs, result, 1);
return result.Get1D(0);
}
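A hedged note on GetProb above: assuming gold is a one-hot matrix and output holds the log-probabilities produced by the LogSoftmax at the end of the forward pass (shown further down), the element-wise product followed by the two reductions yields the batch log-likelihood,

    prob = \sum_{i,j} gold[i][j] * output[i][j] = \sum_i log P(w_i | context_i)

which is why the training loop accumulates it with `loss -= prob`.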
......@@ -784,7 +781,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* generate word embedding of position i:
embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);
MatrixMul(input, X_NOTRANS, w, X_NOTRANS, embedding);
eList.Add(&net.embeddings[i]);
}
......@@ -792,7 +789,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* concatenate word embeddings
embeddingcat = cat(embedding_0...embedding_{n-1}) */
InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
_Concatenate(&eList, &net.embeddingCat, 1);
Concatenate(eList, net.embeddingCat, 1);
/* go over each hidden layer */
for(int i = 0; i < depth; i++){
......@@ -807,22 +804,22 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* generate hidden states of layer i:
s = h_pre * w */
_MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);
MatrixMul(h_pre, X_NOTRANS, w, X_NOTRANS, s);
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
Unsqueeze(b, b2D, 0, batchSize);
/* introduce bias term:
s = s + b
NOTE: the trick here is to extend b to a 2d tensor
to fit into the 2d representation in tensor summation */
_Sum(&s, &b2D, &s);
Sum(s, b2D, s);
/* pass the state through the hard tanh function:
h = tanh(s) */
_HardTanH(&s, &h);
HardTanH(s, h);
}
/* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
......@@ -840,16 +837,16 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
MatrixMul(h_last, X_NOTRANS, w, X_NOTRANS, s);
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
Unsqueeze(b, b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
Sum(s, b2D, s);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
LogSoftmax(s, y, 1);
}
}
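The bias trick in the comments above (extend b to a 2-D tensor with Unsqueeze, then Sum) is ordinary row-wise broadcasting. A minimal scalar sketch of the same computation, independent of the XTensor API (names are illustrative):

/* s[i][j] += b[j] for every row i -- what Unsqueeze(b, b2D, 0, batchSize) + Sum(s, b2D, s) amounts to */
void AddBiasRows(float * s, const float * b, int batchSize, int hSize)
{
    for (int i = 0; i < batchSize; i++)
        for (int j = 0; j < hSize; j++)
            s[i * hSize + j] += b[j];
}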
......@@ -891,18 +888,18 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
x is the top most hidden layer)
so we know
dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
MatrixMul(x, X_TRANS, deds, X_NOTRANS, dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
ReduceSum(deds, dedb, 0);
/* then, we compute
dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
= \sum_j' (dE/ds_{j'} * w_{j, j'})
i.e.,
dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
MatrixMul(deds, X_NOTRANS, w, X_TRANS, dedx);
XTensor &gradPassed = dedx;
XTensor dedsHidden;
......@@ -927,20 +924,20 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* backpropagation through the activation function:
dE/ds = dE/dh * dh/ds */
_HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
_HardTanHBackward(&h, &s, &dedh, &deds);
/* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
MatrixMul(x, X_TRANS, deds, X_NOTRANS, dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
ReduceSum(deds, dedb, 0);
/* gradient of the input: dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
MatrixMul(deds, X_NOTRANS, w, X_TRANS, dedx);
if (i > 0)
_CopyValues(&dedx, &gradPassed);
CopyValues(dedx, gradPassed);
}
TensorList eList(n - 1);
......@@ -955,7 +952,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;
/* split the concatenation of gradients of the embeddings */
_Split(&dedyCat, &eList, 1, n - 1);
Split(dedyCat, eList, 1, n - 1);
/* go over for each word */
for (int i = 0; i < n - 1; i++) {
......@@ -966,7 +963,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* gradient of the embedding weight: dE/dw += x^T * dE/dy
NOTE that we accumulate dE/dw here because the matrix w
is shared by several layers (or words) */
_MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);
MatrixMul(x, X_TRANS, *dedy, X_NOTRANS, dedw, 1.0F, 1.0F);
delete dedy;
}
......@@ -1171,9 +1168,10 @@ void Test(const char * test, const char * result, FNNModel &model)
else {
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
output = Log(output);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
......@@ -1201,6 +1199,7 @@ void Test(const char * test, const char * result, FNNModel &model)
}
fclose(file);
fclose(ofile);
double elapsed = GetClockSec() - startT;
......
......@@ -297,7 +297,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
Descale(preID, sizeVocab);
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
......@@ -311,13 +311,13 @@ void T2TSearch::Generate(T2TStateBundle * beam)
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
XTensor indexCPU;
InitTensorV2(&indexCPU, index.order, index.dimSize, index.dataType, -1);
CopyValues(index, indexCPU);
XTensor indexGPU;
indexGPU = CopyValues(index);
//InitTensor(&indexCPU, index.order, index.dimSize, index.dataType, index.denseRatio, -1);
//CopyValues(index, indexCPU);
for (int i = 0; i < indexCPU.unitNum; i++)
indexCPU.SetInt(i * stride + indexCPU.GetInt(i), i);
for (int i = 0; i < indexGPU.unitNum; i++)
indexGPU.SetInt(i * stride + indexGPU.GetInt(i), i);
CheckNTErrors(XTensor::IsSameShaped(&prob, &probPath), "Wrong tensor shape!");
......@@ -338,8 +338,8 @@ void T2TSearch::Generate(T2TStateBundle * beam)
prob.Reshape(1, prob.unitNum);
probTopK.Reshape(1, probTopK.unitNum);
_Gather(&probPath, &probPathTopK, probPathTopK.order - 1, (int*)indexCPU.data, indexCPU.unitNum);
_Gather(&prob, &probTopK, probTopK.order - 1, (int*)indexCPU.data, indexCPU.unitNum);
_CopyIndexed(&probPath, &probPathTopK, probPathTopK.order - 1, &indexGPU);
_CopyIndexed(&prob, &probTopK, probTopK.order - 1, &indexGPU);
probPath.Reshape(order, dims);
probPathTopK.Reshape(order, dimsTopK);
......
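The index manipulation above is plain integer arithmetic over a flattened (beam x vocab) score matrix. A minimal sketch with illustrative values (not from this commit):

/* decompose a flat top-k id over a (beamSize x sizeVocab) matrix */
int sizeVocab = 4096;               /* hypothetical vocabulary size */
int flatID    = 12345;              /* hypothetical top-k index     */
int prevState = flatID / sizeVocab; /* previous hypothesis id -- what DescaleMe(preID, sizeVocab) recovers */
int wordID    = flatID % sizeVocab; /* predicted word id within the vocabulary */

The loop `indexGPU.SetInt(i * stride + indexGPU.GetInt(i), i)` then turns each per-row offset into a global offset into the reshaped (1, unitNum) tensors, presumably so that a single _CopyIndexed call can gather across all rows at once.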
......@@ -60,7 +60,7 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName)
}
}
/****************************************************
/*
Below is for calling CPU BLAS for fast matrix operations
I'm not sure how fast it is. But it seems that other
guys are crazy about this. So I decided to have a try.
......@@ -81,35 +81,4 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
return f;
}
/*
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
if(typeS == X_FLOAT && typeT == X_FLOAT16){
for(int i = 0; i < size; i++){
((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
}
}
else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
for(int i = 0; i < size; i++){
((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
}
}
else{
ShowNTErrors("Unsupported data types for conversion!");
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -49,15 +49,6 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
void CudaConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -51,7 +51,13 @@ bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
bool useCUDA = false;
#ifdef USE_CUDA
bool useCUDA = true;
#else
bool useCUDA = false;
#endif
FILE * tmpLog = NULL;
double myTime = 0;
......
......@@ -59,6 +59,8 @@ const char * GetOPName(int type)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MASK)
return "M_MASK";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
......
......@@ -48,7 +48,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
......@@ -79,7 +80,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* data and shape related operations */
#define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1
#define GETANDSET_SELECT GETANDSET + 1
#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
#define GETANDSET_SELECT GETANDSET_CONVERTDATATYPE + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
......
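The operator IDs in XName.h are assigned by chaining #defines, so inserting M_MASK means defining it off the previous entry and re-basing the next one, as the hunk above does. A minimal standalone sketch of the scheme (values illustrative, not the library's):

#define MATH_DIVDIM    10                   /* some existing id           */
#define MATH_MASK      MATH_DIVDIM + 1      /* 11: the newly inserted op  */
#define MATH_MATRIXMUL MATH_MASK + 1        /* 12: now defined off MASK   */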
......@@ -48,6 +48,7 @@
#include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h"
#include "function/Identity.h"
#include "core/CHeader.h"
#ifdef USE_CUDA
......@@ -485,6 +486,12 @@ XTensor XTensor::operator- (const DTYPE shift) const
return ScaleAndShift(*this, 1, -shift);
}
/* overloading of the unary minus-sign (negation) */
XTensor XTensor::operator- () const
{
return Negate(*this);
}
/* overloading of the division-sign */
XTensor XTensor::operator/ (const XTensor& tensor) const
{
......@@ -837,6 +844,12 @@ void XTensor::SetData(const void * d, int num, int beg)
XMemCopy((char*)data + beg * unitSize, devID, d, -1, num * unitSize);
}
/* generate data items with a uniform distribution in [0, 1] */
void XTensor::Rand(int rNum, int cNum)
{
_SetDataRand(this, rNum, cNum);
}
/*
set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
......@@ -2425,7 +2438,7 @@ initialize a dense 5d tensor V2
*/
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[5];
dims[0] = d0;
......
......@@ -238,6 +238,9 @@ public:
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift) const;
/* overloading of the unary minus-sign (negation) */
XTensor operator- () const;
/* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor) const;
......@@ -301,6 +304,9 @@ public:
/* set the tensor with an data array */
void SetData(const void * d, int num, int beg = 0);
/* generate data items with a uniform distribution in [0, 1] */
void Rand(int rNum, int cNum);
/* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
......@@ -497,7 +503,7 @@ void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, co
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
......
......@@ -36,13 +36,9 @@
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
......@@ -56,7 +52,6 @@
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
......@@ -97,5 +92,4 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__
......@@ -151,16 +151,35 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
XTensor c(&a);
c.SetTMPFlag();
/* call _Sum function */
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
return c;
}
/*
mask entries of a given tensor (put the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
if (c.enableGrad) {
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
}
}
}
\ No newline at end of file
......@@ -34,7 +34,7 @@ c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (on site):
......@@ -42,10 +42,10 @@ a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha);
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha = 0.0);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha = 0.0);
/*
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
......@@ -53,6 +53,14 @@ where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (put the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
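For reference, a minimal scalar sketch of what _Mask computes, independent of the XTensor API (names are illustrative):

/* c[i] = a[i] where mask[i] != 0, otherwise alpha */
void MaskArray(const float * a, const int * mask, float * c, int n, float alpha)
{
    for (int i = 0; i < n; i++)
        c[i] = (mask[i] != 0) ? a[i] : alpha;
}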
......@@ -202,7 +202,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c)
{
if (!(a && b && c))
return false;
......@@ -231,10 +233,13 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTen
dimSize[sub++] = bm;
for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i])
if (dimSize[i] != c->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
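The added delete[] calls above fix a leak of dimSize on the early-return path. A hedged alternative sketch (not what this commit does) is to let an RAII holder release the array on every path:

#include <memory>

/* sketch: std::unique_ptr<int[]> frees the array on every return, no explicit delete[] needed */
bool AllPositive(const int * src, int order)
{
    std::unique_ptr<int[]> dimSize(new int[order]);
    for (int i = 0; i < order; i++) {
        dimSize[i] = src[i];
        if (dimSize[i] <= 0)
            return false;
    }
    return true;
}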
......@@ -303,8 +308,8 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
}
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner)
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......@@ -337,7 +342,7 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
}
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
if (c.enableGrad) {
/* tensor connections */
......@@ -400,7 +405,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
}
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......
......@@ -40,8 +40,11 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
......@@ -56,11 +59,16 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0,
XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
......@@ -69,7 +77,6 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
......@@ -154,7 +154,7 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
......
......@@ -66,7 +66,7 @@ operation c = x * w + b MulAndShift
<< return - the result of matrix multiplication
*/
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
......@@ -129,9 +129,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DelTensorBuf(tmp);
return c;
}
}
\ No newline at end of file
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -123,9 +123,9 @@ where i is the item index
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Negate(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaNegate(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
db[i] = -d[i];
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
_Negate(a, a);
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void NegateMe(XTensor& a)
{
_Negate(&a, &a);
}
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of input tensor
*/
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
}
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void Negate(const XTensor & a, XTensor & b)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its negative value (CUDA Kernel)
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = -a[i];
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
b[i] = __float2half(-__half2float(a[i]));
#endif
}
/*
set each entry to its negative value
>> a - input tensor
>> b - output tensor
*/
void _CudaNegate(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its negative value (CUDA Kernel) */
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type */
__global__
void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_H__
#define __NEGATE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
void _Negate(const XTensor * a, XTensor * b);
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
void NegateMe(XTensor & a);
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Sign(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaSign(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0)
db[i] = 1.0F;
else if (d[i] == 0)
db[i] = 0.0F;
else
db[i] = -1.0F;
}
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
_Sign(a, a);
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void SignMe(XTensor& a)
{
_Sign(&a, &a);
}
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
}
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void Sign(const XTensor & a, XTensor & b)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Sign.h"
#include "Sign.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its sign value (CUDA Kernel)
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (a[i] > 0)
b[i] = 1.0F;
else if (a[i] == 0)
b[i] = 0.0F;
else
b[i] = -1.0F;
}
}
/*
set each entry to its sign value with float16 data type (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(__half * a, __half * b, int size)
{
/* float16 sign is not implemented yet; this kernel is a stub */
return;
}
/*
set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _CudaSign(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_CUH__
#define __SIGN_CUH__
#include "Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its sign value (CUDA Kernel) */
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
__global__
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_H__
#define __SIGN_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
void _Sign(const XTensor * a, XTensor * b);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SignMe(XTensor * a);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void SignMe(XTensor & a);
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sum of a tensor and a vector (column vector) in a column by column manner
for each column a_col (in a block), we have
c_col = a_col + b * \beta
where b is a vector.
>> a - a tensor
>> b - a vector with the same column size with a
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
int blockNum = 1;
for (int i = 2; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int blockSize = colNum * rowNum;
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSumByColumnTV(a, b, c, beta);
#endif
}
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "TODO!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < rowNum; i++) {
DTYPE * ap = (DTYPE*)a->data + k * blockSize + i * colNum;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data + k * blockSize + i * colNum;
DTYPE v = bp[i];
for (int j = 0; j < colNum; j++)
cp[j] = ap[j] + v * beta;
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of a tensor and a vector (column vector)
c_col = a_col + b * \beta
>> a - a tensor
>> b - a vector with the same column size with a
>> c - where we put a+b. we save it in a
>> colNum - column number (of a block)
>> blockSize - size of a block
>> size - size of the entire data array
>> beta - the scaling factor
*/
__global__
void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= size)
return;
int offset = i % blockSize;
int row = offset / colNum;
c[i] = a[i] + b[row] * beta;
}
/*
summation of a tensor and a vector (column vector)
for each column a_col (in a block), we have
c_col = a_col + b * \beta
where b is a vector.
>> a - a tensor
>> b - a vector with the same column size with a
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
int blockNum = 1;
for (int i = 2; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int cudaGridSize[3];
int cudaBlockSize[3];
GDevs.GetCudaThread(c->devID, a->unitNum, cudaGridSize, cudaBlockSize);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
KernelADDByColumnTV << <dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, colNum, rowNum * colNum, a->unitNum, beta);
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __REDUCEMAX_CUH__
#define __REDUCEMAX_CUH__
#include "../reduce/ReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of a tensor and a vector (column vector) */
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a tensor and a (column) vector */
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNTV_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sum of a vector (column vector) and a tensor in a column by column manner
for each column b_col, we have
c = a + \sum{col} b_col * \beta
where c and a are vectors, and b_col is a column in b.
>> a - a vector with the same column size as b
>> b - a tensor
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSumByColumnVT(a, b, c, beta);
#endif
}
else {
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
int blockNum = 1;
for (int i = 2; i < b->order; i++)
blockNum *= b->dimSizeRDI[i];
int blockSize = colNum * rowNum;
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "TODO!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
for (int i = 0; i < rowNum; i++) {
    DTYPE * ap = (DTYPE*)a->data;
    DTYPE * cp = (DTYPE*)c->data;
    DTYPE sum = 0;
    /* accumulate over every block and every column of row i, so that
       multi-block inputs match the CUDA kernel and the formula above */
    for (int k = 0; k < blockNum; k++) {
        DTYPE * bp = (DTYPE*)b->data + k * blockSize + i * colNum;
        for (int j = 0; j < colNum; j++)
            sum += bp[j];
    }
    cp[i] = ap[i] + sum * beta;
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
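/*
A minimal illustrative sketch (an assumption-laden example, not taken from the library's
test suite): it assumes InitTensor2D and XTensor::SetZeroAll are visible through XTensor.h.
Here a and c are column vectors with one entry per row of b, and the call computes
c(i) = a(i) + beta * (sum over all columns of row i of b).
*/
void IllustrateSumByColumnVT()
{
    XTensor a;
    XTensor b;
    XTensor c;

    InitTensor2D(&a, 2, 1);    /* column vector */
    InitTensor2D(&b, 2, 3);    /* tensor with 2 rows and 3 columns */
    InitTensor2D(&c, 2, 1);    /* result vector with the same shape as a */

    a.SetZeroAll();
    b.SetZeroAll();

    _SumByColumnVT(&a, &b, &c, (DTYPE)1.0);
}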
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of a vector (column vector) and a tensor
c = a + \sum{col} b_col * \beta
>> a - a vector with the same column size as b
>> b - a tensor
>> c - where we put a+b. we save it in a
>> colNum - column number (of a block)
>> blockSize - size of a block
>> size - size of the entire data array
>> beta - the scaling factor
*/
__global__
void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
{
int row = blockDim.x * blockIdx.x + threadIdx.x;
if (row >= rowNum)
return;
DTYPE sum = 0;
for (int k = 0; k < blockNum; k++) {
DTYPE * bp = b + (rowNum * k + row) * colNum;
if (colNum % 4 == 0) {
for (int i = 0; i < colNum; i += 4)
sum += bp[i] + bp[i + 1] + bp[i + 2] + bp[i + 3];
}
else if (colNum % 2 == 0) {
for (int i = 0; i < colNum; i += 2)
sum += bp[i] + bp[i + 1];
}
else {
for (int i = 0; i < colNum; i++)
sum += bp[i];
}
__syncthreads();
}
c[row] = a[row] + beta * sum;
}
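/*
note on the loops above (illustration only): the per-row column sum is manually
unrolled by 4 or by 2 whenever colNum allows it, purely to cut loop overhead;
all three branches produce the same sum, e.g. for colNum = 4 a thread adds
bp[0] + bp[1] + bp[2] + bp[3] in one iteration.
*/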
/*
summation of a vector (column vector) and a tensor
for each column b_col, we have
c = a + \sum{col} b_col * \beta
where c and a are vectors, and b_col is a column in b.
>> a - a vector with the same column size as b
>> b - a tensor
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
int blockNum = 1;
for (int i = 2; i < b->order; i++)
blockNum *= b->dimSizeRDI[i];
int cudaGridSize[3];
int cudaBlockSize[3];
GDevs.GetCudaThread(c->devID, a->dimSizeRDI[1], cudaGridSize, cudaBlockSize);
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
KernelADDByColumnVT << <dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, colNum, rowNum, blockNum, beta);
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __SUMBYCOLUMNVT_CUH__
#define __SUMBYCOLUMNVT_CUH__
#include "SumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of a vector (column vector) and a tensor */
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNVT_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __SUMBYCOLUMNVT_H__
#define __SUMBYCOLUMNVT_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNVT_H__
......@@ -20,20 +20,55 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "ConvertDataType.h"
#include "ConvertDataType.cuh"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT,
int size)
{
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
if(typeS == X_FLOAT && typeT == X_FLOAT16){
for(int i = 0; i < size; i++){
((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
}
}
else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
for(int i = 0; i < size; i++){
((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
}
}
else{
ShowNTErrors("Unsupported data types for conversion!");
}
}
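/*
A minimal illustrative sketch (an assumption-laden example, not taken from the library's
test suite): a CPU-side float -> float16 -> float round trip over a small stack array,
using the raw-buffer ConvertDataType above with devID = -1.
*/
void IllustrateRawConvertDataType()
{
    float src[4] = {1.0F, 0.5F, -2.0F, 3.25F};
    unsigned short halfBuf[4];    /* float16 values are kept in unsigned short storage here */
    float back[4];

    ConvertDataType(-1, src, X_FLOAT, halfBuf, X_FLOAT16, 4);
    ConvertDataType(-1, halfBuf, X_FLOAT16, back, X_FLOAT, 4);
    /* back[] now holds the float16-rounded versions of the values in src[] */
}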
/*
convert data type
>> input - input tensor
>> output - output tensor
>> input - the input tensor
>> output - the output tensor
*/
void _ConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......@@ -59,6 +94,50 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
}
else
ShowNTErrors("Unsupported data types for conversion!");
}
/*
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - the input tensor
<< return - the output tensor with the specified data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
if (input.dataType == dataType) {
XTensor output;
output = CopyValues(input);
return output;
}
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
_ConvertDataType(&input, &output);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
return output;
}
void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE dataType)
{
if (!output.isInit || input.dataType != output.dataType) {
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
}
_ConvertDataType(&input, &output);
/* tensor connection */
if (output.enableGrad)
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
}
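/*
A minimal illustrative sketch (an assumption-laden example): it assumes InitTensor2D
creates an X_FLOAT tensor by default and that the float <-> int conversion is supported
on the chosen device; it shows the two calling styles defined above.
*/
void IllustrateConvertDataType()
{
    XTensor x;
    InitTensor2D(&x, 2, 2);    /* a small float tensor on the CPU by default */
    x.SetZeroAll();

    /* style 1: build and return a new tensor with the requested data type */
    XTensor xInt = ConvertDataType(x, X_INT);

    /* style 2: write the converted data into a caller-provided tensor */
    XTensor y;
    ConvertDataType(x, y, X_INT);
}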
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
*/
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XDevice.h"
......@@ -67,44 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){
outputData[i] = (float)(inputData[i]);
}}
/*
data conversion (cuda code)
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void _CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID >= 0), "This code must be run on GPUs!");
if(typeS == typeT)
return;
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if(typeS == X_FLOAT && typeT == X_FLOAT16)
KernelFloatToFloat16<<<blocks, threads>>>((float*)s, (__half*)t, size);
else if(typeS == X_FLOAT16 && typeT == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)s, (float*)t, size);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
ProtectCudaDev(devID, devIDBackup);
}
/*
......@@ -114,8 +77,6 @@ convert data type (cuda code)
*/
void _CudaConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......@@ -131,13 +92,17 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
ProtectCudaDev(input->devID, devIDBackup);
if(input->dataType == X_FLOAT && output->dataType == X_INT)
KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
KernelFloatToInt<<<blocks, threads>>>
((float*)input->data, (int*)output->data, input->unitNum);
else if(input->dataType == X_INT && output->dataType == X_FLOAT)
KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
KernelIntToFloat<<<blocks, threads>>>
((int*)input->data, (float*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
KernelFloatToFloat16<<<blocks, threads>>>
((float*)input->data, (__half*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
KernelFloat16ToFloat<<<blocks, threads>>>
((__half*)input->data, (float*)output->data, input->unitNum);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __CONVERTDATATYPE_CUH__
#define __CONVERTDATATYPE_CUH__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __CONVERTDATATYPE_H__
#define __CONVERTDATATYPE_H__
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* data conversion (for lower precision computation) */
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
/* convert data type */
void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__
......@@ -466,13 +466,23 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
}
}
/* generate data items with a uniform distribution in [0, 1] */
void _SetDataRand(XTensor * tensor, int rNum, int cNum)
{
if (tensor == NULL || tensor->isInit == false || tensor->order != 2) {
InitTensor2D(tensor, rNum, cNum);
}
_SetDataRand(tensor, 0.0F, 1.0F);
}
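/*
A minimal illustrative sketch (an assumption-laden example): an uninitialized tensor
passed to the overload above is (re)allocated as a rNum x cNum matrix and then filled
with uniform values in [0, 1].
*/
void IllustrateSetDataRand01()
{
    XTensor t;
    _SetDataRand(&t, 3, 4);    /* t becomes a 3 x 4 tensor with entries drawn from [0, 1] */
}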
/*
generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
*/
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
......@@ -525,7 +535,7 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
......
......@@ -569,15 +569,17 @@ void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
DTYPE variance = upper - lower;
if(variance != 1.0F || lower != 0){
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandFloat <<<blocks, threads >>>
((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandDouble <<<blocks, threads >>>
((double*)tensor->data, tensor->unitNum, lower, variance);
}
BacktoCudaDev(tensor->devID, devIDBackup);
......
......@@ -63,12 +63,15 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
/* generate data as lower triangular matrices for last two dimensions */
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [0, 1] */
void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
......
......@@ -26,230 +26,167 @@
namespace nts {
int scale(int x, int scale)
template<class T1, class T2>
T1 BinaryDescale(T1 x, T2 num)
{
return x * scale;
return (T1)(x / num);
}
float scale(float x, float scale)
template<class T1, class T2>
T1 BinaryPower(T1 x, T2 num)
{
return x * scale;
if (num == 0)
return (T1)1.0;
else if (num == 0.5)
return (T1)sqrt(x);
else if (num == 2)
return x * x;
else {
if (x == 0 && num < 0)
return (T1)1e20F;
else
return (T1)pow(x, num);
}
}
int descale(int x, int descale)
template<class T1, class T2>
T1 BinaryScale(T1 x, T2 num)
{
return x / descale;
return (T1)(x * num);
}
float descale(float x, float descale)
template<class T1, class T2>
T1 BinaryShift(T1 x, T2 num)
{
return x / descale;
return (T1)(x + num);
}
int shift(int x, int shift)
int BinaryMod(int x, int num)
{
return x + shift;
return x % num;
}
float shift(float x, float shift)
{
return x + shift;
}
int mod(int x, int mod)
{
return x % mod;
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i], num); \
}
#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
void funcName(XTensor &a, int num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
void funcName(XTensor &a, float num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, int num) \
{ \
_funcName(&a, &b, num); \
} \
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, float num) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
} \
#define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, float num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
/* define three macros separately, specifying the respective function names */
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
template<class T> \
void _funcName(const XTensor * a, XTensor * b, T num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
if (useCUDA) { \
_cudaFuncName(a, b, num); \
return; \
} \
else \
ShowNTErrors("No GPU devices support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc((int)d[i], (T)num); \
} \
else if (a->dataType == X_FLOAT) { \
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc((float)d[i], (T)num); \
} \
else if (a->dataType == X_DOUBLE) { \
double * d = (double*)a->data; \
double * db = (double*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (double)origFunc((double)d[i], (T)num); \
} \
else \
ShowNTErrors("TO DO!"); \
} \
template void _funcName<int>(const XTensor*, XTensor*, int); \
template void _funcName<float>(const XTensor*, XTensor*, float); \
template void _funcName<double>(const XTensor*, XTensor*, double);
#define _SIMPLE_BINARY_FUNCTION_ME(_funcNameMe, _funcName) \
template<class T> \
void _funcNameMe(XTensor * a, T num) \
{ \
_funcName(a, a, num); \
} \
template void _funcNameMe<int>(XTensor*, int); \
template void _funcNameMe<float>(XTensor*, float); \
template void _funcNameMe<double>(XTensor*, double);
#define SIMPLE_BINARY_FUNCTION_ME(funcNameMe, _funcName) \
template<class T> \
void funcNameMe(XTensor &a, T num) \
{ \
_funcName(&a, &a, num); \
} \
template void funcNameMe<int>(XTensor&, int); \
template void funcNameMe<float>(XTensor&, float); \
template void funcNameMe<double>(XTensor&, double);
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId) \
template<class T> \
XTensor funcName(const XTensor &a, T num) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
return b; \
} \
template XTensor funcName<int>(const XTensor&, int); \
template XTensor funcName<float>(const XTensor&, float); \
template XTensor funcName<double>(const XTensor&, double);
#define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
template<class T> \
void funcName(const XTensor &a, XTensor &b, T num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
} \
} \
template void funcName<int>(const XTensor&, XTensor&, int); \
template void funcName<float>(const XTensor&, XTensor&, float); \
template void funcName<double>(const XTensor&, XTensor&, double);
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescale, BinaryDescale)
_SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_ME(DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE)
SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Mod, _CudaMod, BinaryMod)
_SIMPLE_BINARY_FUNCTION_ME(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_ME(ModMe, _Mod)
SIMPLE_BINARY_FUNCTION(Mod, _Mod, MATH_MOD)
SIMPLE_BINARY_FUNCTION_VOID(Mod, _Mod, MATH_MOD)
_SIMPLE_BINARY_FUNCTION(_Power, _CudaPower, BinaryPower)
_SIMPLE_BINARY_FUNCTION_ME(_PowerMe, _Power)
SIMPLE_BINARY_FUNCTION_ME(PowerMe, _Power)
SIMPLE_BINARY_FUNCTION(Power, _Power, MATH_POWER)
SIMPLE_BINARY_FUNCTION_VOID(Power, _Power, MATH_POWER)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScale, BinaryScale)
_SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_ME(ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShift, BinaryShift)
_SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION_ME(ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
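/*
A minimal illustrative sketch of the templated interface instantiated above (an
assumption-laden example): it assumes InitTensor2D and XTensor::SetZeroAll are
visible here and that the default data type is X_FLOAT.
*/
void IllustrateBinaryOps()
{
    XTensor a;
    InitTensor2D(&a, 2, 2);
    a.SetZeroAll();

    XTensor b = Scale(a, 3);       /* b = a * 3, instantiated with T = int */
    XTensor c = Power(a, 2.0F);    /* c = a squared, instantiated with T = float */
    ShiftMe(a, 1.0F);              /* in place: a = a + 1 */
}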
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i], num); \
}
#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
void funcName(XTensor &a, int num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
void funcName(XTensor &a, float num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, int num) \
{ \
_funcName(&a, &b, num); \
} \
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, float num) \
{ \
_funcName(&a, &b, num); \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_Shift, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#endif
} // namespace nts(NiuTrans.Tensor)
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "Binary.h"
#include "Binary.cuh"
......@@ -28,134 +29,108 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
int cudascale(int x, int scale)
int BinaryCudaMod(int x, int base)
{
return x * scale;
return x % base;
}
template<class T1, class T2>
__device__
float cudascale(float x, float scale)
T1 BinaryCudaDescale(T1 x, T2 num)
{
return x * scale;
return x / num;
}
template<class T1, class T2>
__device__
int cudadescale(int x, int descale)
T1 BinaryCudaPower(T1 x, T2 num)
{
return x / descale;
if (num == 0)
return (T1)1.0;
else if (num == 0.5)
return (T1)sqrt((float)x);
else if (num == 2)
return (T1)(x * x);
else {
if (x == 0 && num < 0)
return (T1)1e20F;
else
return (T1)pow((float)x, (float)num);
}
}
template<class T1, class T2>
__device__
float cudadescale(float x, float descale)
T1 BinaryCudaScale(T1 x, T2 num)
{
return x / descale;
return x * num;
}
template<class T1, class T2>
__device__
int cudashift(int x, int shift)
T1 BinaryCudaShift(T1 x, T2 num)
{
return x + shift;
}
__device__
float cudashift(float x, float descale)
{
return x + descale;
}
__device__
int cudamod(int x, int mod)
{
return x % mod;
}
#define SIMPLE_BINARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(int * a, int * b, int size, int num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (int)origFunc(a[i], num); \
} \
\
void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_INT) { \
Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum, num); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
#define SIMPLE_BINARY_FUNCTION_FLOAT_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(float * a, float * b, int size, float num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (float)origFunc(a[i], num); \
} \
\
\
void _Cuda##funcName(const XTensor * a, XTensor * b, float num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((float*)a->data, (float*)b->data, a->unitNum, num);\
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
return x + num;
}
SIMPLE_BINARY_FUNCTION_GPU(Scale, cudascale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ScaleFloat, cudascale)
SIMPLE_BINARY_FUNCTION_GPU(Descale, cudadescale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(DescaleFloat, cudadescale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, cudashift)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ShiftFloat, cudashift)
SIMPLE_BINARY_FUNCTION_GPU(Mod, cudamod)
#define SIMPLE_BINARY_FUNCTION_GPU(funcName, origFunc) \
template<class T1, class T2> \
__global__ \
void Kernel##funcName(T1 * a, T1 * b, int size, T2 num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (T1)origFunc((T1)a[i], (T2)num); \
} \
\
template<class T> \
void _Cuda##funcName(const XTensor * a, XTensor * b, T num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((float*)a->data, (float*)b->data, a->unitNum, (T)num); \
} \
else if (a->dataType == X_DOUBLE) { \
Kernel##funcName<<<blocks, threads>>> \
((double*)a->data, (double*)b->data, a->unitNum, (T)num); \
} \
else if (a->dataType == X_INT) { \
Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum, (T)num); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
template void _Cuda##funcName<int>(const XTensor*, XTensor*, int); \
template void _Cuda##funcName<float>(const XTensor*, XTensor*, float); \
template void _Cuda##funcName<double>(const XTensor*, XTensor*, double);
SIMPLE_BINARY_FUNCTION_GPU(Descale, BinaryCudaDescale)
SIMPLE_BINARY_FUNCTION_GPU(Mod, BinaryCudaMod)
SIMPLE_BINARY_FUNCTION_GPU(Power, BinaryCudaPower)
SIMPLE_BINARY_FUNCTION_GPU(Scale, BinaryCudaScale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, BinaryCudaShift)
#endif // USE_CUDA
......
......@@ -29,38 +29,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* scale each entry (CUDA Kernel) */
__global__
void KernelScale(int * a, int * b, int size, int scale);
__global__
void KernelScale(int * a, int * b, int size, float scale);
/* scale each entry */
void _CudaScale(const XTensor * a, XTensor * b, int scale);
void _CudaScaleFloat(const XTensor * a, XTensor * b, float scale);
/* descale each entry (CUDA Kernel) */
__global__
void KernelDescale(int * a, int * b, int size, int scale);
__global__
void KernelDescale(int * a, int * b, int size, float scale);
/* descale each entry */
void _CudaDescale(const XTensor * a, XTensor * b, int scale);
void _CudaDescaleFloat(const XTensor * a, XTensor * b, float scale);
template<class T>
void _CudaDescale(const XTensor * a, XTensor * b, T num);
/* shift each entry (CUDA Kernel) */
__global__
void KernelShift(int * a, int * b, int size, int shift);
__global__
void KernelShift(int * a, int * b, int size, float shift);
/* shift each entry */
void _CudaShift(const XTensor * a, XTensor * b, int shift);
void _CudaShiftFloat(const XTensor * a, XTensor * b, float shift);
/* power each entry */
template<class T>
void _CudaPower(const XTensor * a, XTensor * b, T num);
/* mod each entry (CUDA Kernel) */
__global__
void KernelMod(int * a, int * b, int size, int base);
/* mod each entry */
void _CudaMod(const XTensor * a, XTensor * b, int base);
template<class T>
void _CudaMod(const XTensor * a, XTensor * b, T base);
/* scale each entry */
template<class T>
void _CudaScale(const XTensor * a, XTensor * b, T num);
/* shift each entry */
template<class T>
void _CudaShift(const XTensor * a, XTensor * b, T num);
#endif // USE_CUDA
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-04-05
*/
#ifndef __BINARY_H__
#define __BINARY_H__
......@@ -26,132 +26,110 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
scale up tensor entires
b = a * scale
*/
void _Scale(const XTensor * a, XTensor * b, int scale);
void _Scale(const XTensor * a, XTensor * b, float scale);
/*
scale up tensor entires (on site)
b = a * scale
*/
void _ScaleMe(XTensor * a, int scale);
void _ScaleMe(XTensor * a, float scale);
/*
scale up tensor entires (on site)
b = a * scale
*/
void ScaleMe(XTensor & a, int scale);
void ScaleMe(XTensor & a, float scale);
/*
scale up tensor entires
b = a * scale
*/
void Scale(const XTensor & a, XTensor &b, int scale);
void Scale(const XTensor & a, XTensor &b, float scale);
/*
scale up tensor entires (return an XTensor structure)
b = a * scale
*/
XTensor Scale(const XTensor & a, float scale);
/*
descale tensor entires
b = a / scale
*/
void _Descale(const XTensor * a, XTensor * b, int scale);
void _Descale(const XTensor * a, XTensor * b, float scale);
/*
descale tensor entires (on site)
b = a / scale
*/
void _DescaleMe(XTensor * a, int scale);
void _DescaleMe(XTensor * a, float scale);
/*
descale tensor entires (on site)
b = a / scale
*/
void DescaleMe(XTensor & a, int scale);
void DescaleMe(XTensor & a, float scale);
/*
descale tensor entires
b = a / scale
*/
void Descale(const XTensor & a, XTensor & b, int scale);
void Descale(const XTensor & a, XTensor & b, float scale);
/*
descale tensor entires (return an XTensor structure)
b = a / scale
*/
XTensor Descale(const XTensor & a, float scale);
/*
shift tensor entires
b = a + shift
*/
void _Shift(const XTensor * a, XTensor * b, int shift);
void _Shift(const XTensor * a, XTensor * b, float shift);
/*
shift tensor entires (on site)
b = a + shift
*/
void _ShiftMe(XTensor * a, int shift);
void _ShiftMe(XTensor * a, float shift);
/*
shift tensor entires (on site)
b = a + shift
*/
void ShiftMe(XTensor & a, int shift);
void ShiftMe(XTensor & a, float shift);
/*
shift tensor entires
b = a + shift
*/
void Shift(const XTensor & a, XTensor & b, int shift);
void Shift(const XTensor & a, XTensor & b, float shift);
/*
shift tensor entires (return an XTensor structure)
b = a + shift
*/
XTensor Shift(const XTensor & a, float shift);
/*
mod tensor entires
b = a % mod
*/
void _Mod(const XTensor * a, XTensor * b, int base);
/*
mod tensor entires (on site)
b = a % mod
*/
void _ModMe(XTensor * a, int base);
/*
mod tensor entires (on site)
b = a % mod
*/
void ModMe(XTensor & a, int base);
/*
mod tensor entires
b = a % mod
*/
void Mod(const XTensor & a, XTensor & b, int base);
/* descale tensor entries
b = a / num */
template<class T>
void _Descale(const XTensor * a, XTensor * b, T num);
/* descale tensor entries (on site)
b = a / num */
template<class T>
void _DescaleMe(XTensor * a, T num);
/* descale tensor entries (on site)
b = a / num */
template<class T>
void DescaleMe(XTensor & a, T num);
/* descale tensor entries
b = a / num */
template<class T>
void Descale(const XTensor & a, XTensor & b, T num);
/* descale tensor entries (return an XTensor structure)
b = a / num */
template<class T>
XTensor Descale(const XTensor & a, T num);
/* mod tensor entries
b = a % base */
template<class T>
void _Mod(const XTensor * a, XTensor * b, T base);
/* mod tensor entries (on site)
b = a % base */
template<class T>
void _ModMe(XTensor * a, T base);
/* mod tensor entries (on site)
b = a % base */
template<class T>
void ModMe(XTensor & a, T base);
/* mod tensor entries
b = a % base */
template<class T>
void Mod(const XTensor & a, XTensor & b, T base);
/* mod tensor entries (return an XTensor structure)
b = a % base */
template<class T>
XTensor Mod(const XTensor & a, T base);
/* get the power(x, y)
b = power(a, num) */
template<class T>
void _Power(const XTensor * a, XTensor * b, T scale);
/* get the power(x, y) (on site)
b = power(a, num) */
template<class T>
void _PowerMe(XTensor * a, T scale);
/* get the power(x, y) (on site)
b = power(a, num) */
template<class T>
void PowerMe(XTensor & a, T scale);
/* get the power(x, y)
b = power(a, num) */
template<class T>
void Power(const XTensor & a, XTensor & b, T scale);
/* get the power(x, y) (return an XTensor structure)
b = power(a, num) */
template<class T>
XTensor Power(const XTensor & a, T scale);
/* scale up tensor entries
b = a * num */
template<class T>
void _Scale(const XTensor * a, XTensor * b, T num);
/* scale up tensor entries (on site)
b = a * num */
template<class T>
void _ScaleMe(XTensor * a, T num);
/* scale up tensor entries (on site)
b = a * num */
template<class T>
void ScaleMe(XTensor & a, T num);
/* scale up tensor entries
b = a * num */
template<class T>
void Scale(const XTensor & a, XTensor & b, T num);
/* scale up tensor entries (return an XTensor structure)
b = a * num */
template<class T>
XTensor Scale(const XTensor & a, T num);
/* shift tensor entries
b = a + num */
template<class T>
void _Shift(const XTensor * a, XTensor * b, T num);
/* shift tensor entries (on site)
b = a + num */
template<class T>
void _ShiftMe(XTensor * a, T num);
/* shift tensor entries (on site)
b = a + num */
template<class T>
void ShiftMe(XTensor & a, T num);
/* shift tensor entries
b = a + num */
template<class T>
void Shift(const XTensor & a, XTensor & b, T num);
/* shift tensor entries (return an XTensor structure)
b = a + num */
template<class T>
XTensor Shift(const XTensor & a, T num);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -37,88 +37,72 @@ DTYPE myIsNotEqual(DTYPE a, DTYPE b)
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
/* define three macros separately, specifying the respective function names */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
if (useCUDA) { \
_cudaFuncName(a, b, number); \
return; \
} \
else \
ShowNTErrors("No GPU devices support!") \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION_ME(funcNameMe, _funcName) \
void funcNameMe(XTensor & a, DTYPE number) \
{ \
_funcName(&a, &a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
#define SIMPLE_COMPARE_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, DTYPE number) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, number); \
}
// I think we needn't make the link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION_ME(EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION_ME(NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
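/*
A minimal illustrative sketch (an assumption-laden example): it assumes InitTensor2D
and XTensor::SetZeroAll are visible here. Equal and NotEqual compare every entry with
a scalar and write 1/0 flags (as DTYPE values) into the result tensor.
*/
void IllustrateCompareOps()
{
    XTensor a;
    InitTensor2D(&a, 2, 2);
    a.SetZeroAll();

    XTensor isZero = Equal(a, 0.0F);       /* every entry is 0, so all flags are 1 */
    XTensor nonZero = NotEqual(a, 0.0F);   /* all flags are 0 here */
}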
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif
......
......@@ -38,6 +38,9 @@ void EqualMe(XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value (return an XTensor structure) */
XTensor Equal(const XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value */
void Equal(const XTensor & a, XTensor & b, DTYPE value);
/* check whether every entry is not equal to the given value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
......@@ -50,6 +53,9 @@ void NotEqualMe(XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value (return an XTensor structure) */
XTensor NotEqual(const XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value */
void NotEqual(const XTensor & a, XTensor & b, DTYPE value);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
......@@ -42,7 +42,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
......@@ -109,7 +111,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
void _NormalizeMe(XTensor * input, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
_Normalize(input, input, dim, mean, var, a, b, epsilon);
}
......@@ -129,7 +133,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void NormalizeMe(XTensor& input, int dim, const XTensor& mean, const XTensor& var, const XTensor& a, const XTensor& b, DTYPE epsilon)
void NormalizeMe(XTensor& input, int dim,
const XTensor& mean, const XTensor& var,
const XTensor& a, const XTensor& b, DTYPE epsilon)
{
_Normalize(&input, &input, dim, &mean, &var, &a, &b, epsilon);
}
......@@ -150,7 +156,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> epsilon - a parameter
<< return - the result of normalized the data with normal distribution
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon)
XTensor Normalize(const XTensor &input, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{
XTensor output(&input);
output.SetTMPFlag();
......@@ -171,4 +179,48 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
return output;
}
/*
normalize the data with normal distribution (write the result into the output tensor)
the output tensor is initialized here if it is not ready
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> output - the output tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
*/
void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{
if (!output.isInit || !XTensor::IsSameShaped(&input, &output)) {
InitTensor(&output, &input);
}
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
if (output.enableGrad == true) {
/* tensor connections */
TensorList list(5);
list.Add((XTensor*)&input);
list.Add((XTensor*)&mean);
list.Add((XTensor*)&var);
list.Add((XTensor*)&a);
list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
}
}
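/*
illustrative sketch (not part of the library API): the per-element effect of the formula
above written as a plain scalar helper, handy for checking a single value by hand; the
helper name is hypothetical.

DTYPE NormalizeOneValue(DTYPE x, DTYPE mean, DTYPE var, DTYPE a, DTYPE b, DTYPE epsilon)
{
return a * (x - mean) / (DTYPE)sqrt(var + epsilon) + b;
}

e.g., x = 3, mean = 1, var = 4, a = 1, b = 0 and epsilon = 0 give 1 * (3 - 1) / sqrt(4) + 0 = 1
*/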
} // namespace nts(NiuTrans.Tensor)
......@@ -31,7 +31,9 @@ normalized the data with normal distribution.
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
......@@ -39,7 +41,9 @@ keep the result in the input tenosr and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
void _NormalizeMe(XTensor * input, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
......@@ -47,7 +51,9 @@ keep the result in the input tenosr and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void NormalizeMe(XTensor & input, int dim, const XTensor & mean, const XTensor & var, const XTensor & a, const XTensor & b, DTYPE epsilon);
void NormalizeMe(XTensor & input, int dim,
const XTensor & mean, const XTensor & var,
const XTensor & a, const XTensor & b, DTYPE epsilon);
/*
normalize the data with normal distribution (return an XTensor structure)
......@@ -55,7 +61,19 @@ make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon);
XTensor Normalize(const XTensor &input, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon);
/*
normalize the data with normal distribution (with both arguments of input and output)
keep the result in the output tensor
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Power.h"
#include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the power(a, p)
>> a - input tensor
>> b - output tensor
>> p - parameter
*/
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaPower(a, b, p);
return;
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * aData = (DTYPE*)a->data;
DTYPE * bData = (DTYPE*)b->data;
if (p == 0) {
for (int i = 0; i < a->unitNum; i++)
bData[i] = (DTYPE)1.0;
}
else if (p == (DTYPE)0.5) {
for (int i = 0; i < a->unitNum; i++)
bData[i] = (DTYPE)sqrt(aData[i]);
}
else if (p == (DTYPE)2.0) {
for (int i = 0; i < a->unitNum; i++)
bData[i] = aData[i] * aData[i];
}
else {
for (int i = 0; i < a->unitNum; i++) {
if (p < 0 && aData[i] == 0)
bData[i] = 1e20F;
else
bData[i] = (DTYPE)pow(aData[i], p);
}
}
}
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void _PowerMe(XTensor * a, DTYPE p)
{
_Power(a, a, p);
}
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void PowerMe(XTensor& a, DTYPE p)
{
_Power(&a, &a, p);
}
/*
get the power(a, p) (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> p - parameter
<< return - the power value of the input tensor
*/
XTensor Power(const XTensor & a, DTYPE p)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Power function */
_Power(&a, &b, p);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
return b;
}
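/*
illustrative usage sketch (not part of the original source), assuming a dense CPU tensor
`a` that has been initialized elsewhere:

XTensor b = Power(a, (DTYPE)2.0);   // b[i] = a[i]^2, kept in a new tensor
_PowerMe(&a, (DTYPE)0.5);           // a[i] = sqrt(a[i]), computed in place
*/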
/*
get the power(a, p)
>> a - input tensor
>> b - output tensor
>> p - parameter
*/
void Power(const XTensor & a, XTensor & b, DTYPE p)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Power function */
_Power(&a, &b, p);
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
}
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../movement/CopyValues.cuh"
#include "Power.h"
#include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its square root (CUDA Kernel)
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = sqrt(a[i]);
}
/*
set each entry to its square root (CUDA Kernel)
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = hsqrt(a[i]);
#else
if (i < size)
b[i] = __float2half(sqrt(__half2float(a[i])));
#endif
}
/*
get power(d[i], p)
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
DTYPE v = a[i];
if (p < 0 && v == 0)
b[i] = 1e20;
else
b[i] = pow(a[i], p);
}
}
/*
get power(d[i], p)
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(__half * a, __half * b, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
float v = __half2float(a[i]);
if (__half2float(p) < 0 && v == 0)
b[i] = __float2half(1e20);
else
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
}
#endif
}
/* get the power of the entries */
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (p == (DTYPE)0.5) {
KernelSqrtV2<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (p == (DTYPE)1.0) {
_CudaCopyValues(a, b);
}
else if (p != (DTYPE)1.0) {
KernelPower<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, p, a->unitNum);
}
}
else if (a->dataType == X_FLOAT16) {
if (p == (DTYPE)0.5) {
KernelSqrtV2<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum);
}
else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __POWER_CUH__
#define __POWER_CUH__
#include "Power.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __POWER_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __POWER_H__
#define __POWER_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */
void _Power(const XTensor * a, XTensor * b, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void _PowerMe(XTensor * a, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void PowerMe(XTensor & a, DTYPE p);
/*
get the power(x, y) (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
/* get the power(x, y) */
void Power(const XTensor & a, XTensor & b, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
#endif // __POWER_H__
......@@ -26,248 +26,207 @@
#include "Unary.cuh"
namespace nts{
DTYPE square(DTYPE x)
template<class T>
T UnaryNegate(T x) {
return (T)-x;
}
template<class T>
T UnarySquare(T x)
{
return x * x;
return (T)(x * x);
}
DTYPE round(DTYPE r)
template<class T>
T UnaryRound(T r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
return (r > 0.0) ? (T)floor(r + 0.5) : (T)ceil(r - 0.5);
}
DTYPE isnonzero(DTYPE r)
template<class T>
T UnarySign(T r)
{
return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
if (r > 0.0)
return (T)1.0;
else if (r == 0.0)
return (T)0.0;
else
return (T)-1.0;
}
DTYPE iszero(DTYPE r)
template<class T>
T UnaryIsNonZero(T r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
return (r != 0.0) ? (T)1.0 : (T)0.0;
}
#ifdef USE_CUDA
/* define the macros separately, specify the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i]); \
template<class T>
T UnaryIsZero(T r)
{
return (r == 0.0) ? (T)1.0 : (T)0.0;
}
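/*
illustrative check (not part of the original source): expected values of the helpers above
for a few sample inputs.

UnaryNegate(3)       == -3
UnarySquare(-2.0f)   == 4.0f
UnaryRound(2.4f)     == 2.0f,   UnaryRound(-2.6f) == -3.0f
UnarySign(-0.5)      == -1.0,   UnarySign(0.0)    == 0.0
UnaryIsNonZero(7.0f) == 1.0f,   UnaryIsZero(7.0f) == 0.0f
*/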
#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a) \
{ \
_funcName(a, a); \
/* define the macros separately, specify the respective function names */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
if (useCUDA) { \
_cudaFuncName(a, b); \
return; \
} \
else \
ShowNTErrors("No GPU device support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i]); \
} \
else if (a->dataType == X_FLOAT) { \
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i]); \
} \
else if (a->dataType == X_DOUBLE) { \
double * d = (double*)a->data; \
double * db = (double*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (double)origFunc(d[i]); \
} \
else \
ShowNTErrors("TODO!"); \
}
#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a) \
{ \
_funcName(a, a); \
}
#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
#define SIMPLE_UNARY_FUNCTION_ME(funcNameMe, _funcName) \
void funcNameMe(XTensor & a) \
{ \
_funcName(&a, &a); \
}
#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor & a) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor & a, XTensor & b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
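/*
illustrative expansion (not part of the original source): for example,
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE) below generates roughly

XTensor Square(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
_Square(&a, &b);
XLink::MakeLink(&a, NULL, &b, MATH_SQUARE);
return b;
}

so each instantiation line adds one public wrapper around the corresponding low-level
_funcName routine.
*/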
_SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, UnaryIsNonZero)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, UnaryIsZero)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Negate, _CudaNegate, UnaryNegate)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sign, _CudaSign, UnarySign)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, UnarySquare)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
#else
/* define the macros separately, specify the respective function names (CPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i]); \
}
#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a) \
{ \
_funcName(a, a); \
}
#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
_SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION_ME(AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION_ME(CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION_ME(ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION_ME(FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION_ME(IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION_ME(IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION_ME(LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_NegateMe, _Negate)
SIMPLE_UNARY_FUNCTION_ME(NegateMe, _Negate)
SIMPLE_UNARY_FUNCTION(Negate, _Negate, MATH_NEGATE)
SIMPLE_UNARY_FUNCTION_VOID(Negate, _Negate, MATH_NEGATE)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION_ME(RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SignMe, _Sign)
SIMPLE_UNARY_FUNCTION_ME(SignMe, _Sign)
SIMPLE_UNARY_FUNCTION(Sign, _Sign, MATH_SIGN)
SIMPLE_UNARY_FUNCTION_VOID(Sign, _Sign, MATH_SIGN)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION_ME(SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION_ME(SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION_ME(SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION_ME(CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION_ME(TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -24,55 +24,139 @@
#include "../../XName.h"
#include "Unary.h"
#include "Unary.cuh"
#include <cuda_runtime.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
template<class T>
__device__
DTYPE cudasquare(DTYPE x)
T UnaryCudaCeil(T x)
{
return (T)ceil((float)x);
}
template<class T>
__device__
T UnaryCudaExp(T x)
{
return (T)exp((float)x);
}
template<class T>
__device__
T UnaryCudaFabs(T x)
{
return (T)fabs((float)x);
}
template<class T>
__device__
T UnaryCudaFloor(T x)
{
return (T)floor((float)x);
}
template<class T>
__device__
T UnaryCudaIsNonZero(T r)
{
return (r != (T)0.0) ? (T)1.0 : (T)0.0;
}
template<class T>
__device__
T UnaryCudaIsZero(T r)
{
return (r == (T)0.0) ? (T)1.0 : (T)0.0;
}
template<class T>
__device__
T UnaryCudaLog(T x)
{
return (T)log((float)x);
}
template<class T>
__device__
T UnaryCudaNegate(T x)
{
return -x;
}
template<class T>
__device__
T UnaryCudaSign(T r)
{
if (r > (T)0)
return 1.0;
else if (r == (T)0)
return 0.0;
else
return -1.0;
}
template<class T>
__device__
T UnaryCudaSqrt(T x)
{
return (T)sqrt((float)x);
}
template<class T>
__device__
T UnaryCudaSquare(T x)
{
return x * x;
}
template<class T>
__device__
DTYPE cudaround(DTYPE r)
T UnaryCudaRound(T r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
return (r > (T)0.0) ? (T)UnaryCudaFloor(r + (T)0.5) : (T)UnaryCudaCeil(r - (T)0.5);
}
template<class T>
__device__
DTYPE cudaisnonzero(DTYPE r)
T UnaryCudaSin(T x)
{
return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
return (T)sin((float)x);
}
template<class T>
__device__
DTYPE cudaiszero(DTYPE r)
T UnaryCudaCos(T x)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
return (T)cos((float)x);
}
template<class T>
__device__
T UnaryCudaTan(T x)
{
return (T)tan((float)x);
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
template<class T> \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
void Kernel##funcName(T * a, T * b, int size) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (DTYPE)origFunc(a[i]); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size) \
{ \
return; \
b[i] = (T)origFunc(a[i]); \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
CheckNTErrors(a->isSparse == false, "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
......@@ -85,35 +169,43 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
((float*)a->data, (float*)b->data, a->unitNum); \
} \
else if (a->dataType == X_FLOAT16) { \
else if (a->dataType == X_DOUBLE) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \
((double*)a->data, (double*)b->data, a->unitNum); \
} \
else if (a->dataType == X_INT) { \
Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
}
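/*
illustrative expansion (not part of the original source): for example,
SIMPLE_UNARY_FUNCTION_GPU(Square, UnaryCudaSquare) below generates a templated kernel

template<class T>
__global__
void KernelSquare(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = (T)UnaryCudaSquare(a[i]);
}

plus a host wrapper _CudaSquare(const XTensor * a, XTensor * b) that launches it with
float, double or int pointers according to a->dataType.
*/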
SIMPLE_UNARY_FUNCTION_GPU(Absolute, UnaryCudaFabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, UnaryCudaCeil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, UnaryCudaExp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, UnaryCudaFloor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, UnaryCudaIsNonZero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, UnaryCudaIsZero)
SIMPLE_UNARY_FUNCTION_GPU(Log, UnaryCudaLog)
SIMPLE_UNARY_FUNCTION_GPU(Negate, UnaryCudaNegate)
SIMPLE_UNARY_FUNCTION_GPU(Round, UnaryCudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Sign, UnaryCudaSign)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, UnaryCudaSqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, UnaryCudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, cudasquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
SIMPLE_UNARY_FUNCTION_GPU(Sin, UnaryCudaSin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, UnaryCudaCos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, UnaryCudaTan)
#endif // USE_CUDA
......
......@@ -29,121 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its absolute value (CUDA Kernel) */
__global__
void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
__global__
void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
void _CudaAbsolute(const XTensor * a, XTensor * b);
/* set each entry to its ceil value (CUDA Kernel) */
__global__
void KernelCeil(DTYPE * a, DTYPE * b, int size);
/* set each entry to its ceil value (CUDA Kernel) with float16 data type*/
__global__
void KernelCeil(__half * a, __half * b, int size);
/* set each entry to its ceil value */
void _CudaCeil(const XTensor * a, XTensor * b);
/* set each entry to its exponent value (CUDA Kernel) */
__global__
void KernelExp(DTYPE * a, DTYPE * b, int size);
/* set each entry to its exponent value (CUDA Kernel) with float16 data type*/
__global__
void KernelExp(__half * a, __half * b, int size);
/* set each entry to its exponent value */
void _CudaExp(const XTensor * a, XTensor * b);
/* set each entry to its floor value (CUDA Kernel) */
__global__
void KernelFloor(DTYPE * a, DTYPE * b, int size);
/* set each entry to its floor value (CUDA Kernel) with float16 data type*/
__global__
void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsNonZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
__global__
void KernelIsNonZero(__half * a, __half * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _CudaIsNonZero(const XTensor * a, XTensor * b);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
__global__
void KernelIsZero(__half * a, __half * b, int size);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _CudaIsZero(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * a, DTYPE * b, int size);
/* set each entry to its logarithm value (CUDA Kernel) with float16 data type*/
__global__
void KernelLog(__half * a, __half * b, int size);
/* set each entry to its logarithm value */
void _CudaLog(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
__global__
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
__global__
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its negative value */
void _CudaNegate(const XTensor * a, XTensor * b);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value (CUDA Kernel) */
__global__
void KernelSqrt(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sqrt value (CUDA Kernel) with float16 data type*/
__global__
void KernelSqrt(__half * a, __half * b, int size);
/* set each entry to its sign value */
void _CudaSign(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value */
void _CudaSqrt(const XTensor * a, XTensor * b);
/* set each entry to its square value (CUDA Kernel) */
__global__
void KernelSquare(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square value (CUDA Kernel) with float16 data type*/
__global__
void KernelSquare(__half * a, __half * b, int size);
/* set each entry to its square value */
void _CudaSquare(const XTensor * a, XTensor * b);
/* set each entry to its sine value (CUDA Kernel) */
__global__
void KernelSin(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sine value (CUDA Kernel) with float16 data type*/
__global__
void KernelSin(__half * a, __half * b, int size);
/* set each entry to its sine value */
void _CudaSin(const XTensor * a, XTensor * b);
/* set each entry to its cosine value (CUDA Kernel) */
__global__
void KernelCos(DTYPE * a, DTYPE * b, int size);
/* set each entry to its cosine value (CUDA Kernel) with float16 data type*/
__global__
void KernelCos(__half * a, __half * b, int size);
/* set each entry to its cosine value */
void _CudaCos(const XTensor * a, XTensor * b);
/* set each entry to its tangent value (CUDA Kernel) */
__global__
void KernelTan(DTYPE * a, DTYPE * b, int size);
/* set each entry to its tangent value (CUDA Kernel) with float16 data type*/
__global__
void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
......
......@@ -124,6 +124,20 @@ XTensor Log(const XTensor & a);
/* set every entry to its logarithm value */
void Log(const XTensor & a, XTensor & b);
/* set every entry to its negative value */
void _Negate(const XTensor * a, XTensor * b);
/* set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing */
void _NegateMe(XTensor * a);
/* set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing */
void NegateMe(XTensor & a);
/* set every entry to its negative value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Negate(const XTensor & a);
/* set every entry to its negative value */
void Negate(const XTensor & a, XTensor & b);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
......@@ -138,6 +152,20 @@ XTensor Round(const XTensor & a);
/* set every entry to its round value */
void Round(const XTensor & a, XTensor & b);
/* set every entry to its sign value */
void _Sign(const XTensor * a, XTensor * b);
/* set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing */
void _SignMe(XTensor * a);
/* set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing */
void SignMe(XTensor & a);
/* set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
......@@ -166,7 +194,6 @@ XTensor Square(const XTensor & a);
/* set every entry to its square value */
void Square(const XTensor & a, XTensor & b);
/* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b);
/* set every entry to its sine value (do it on site)
......
......@@ -189,6 +189,29 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
}
}
/*
copy selected sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, int copyNum)
{
XTensor * tgtIndex = NewTensor(srcIndex);
tgtIndex->SetAscendingOrder(0);
_CopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
delete tgtIndex;
}
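/*
illustrative example (not part of the original source): following the comment above,
calling this wrapper with srcIndex = [1, 4] and copyNum = 2 copies the source
sub-tensors 1, 2, 4 and 5; the target indices are generated automatically as the
ascending sequence 0, 1 by SetAscendingOrder before _CopyIndexed is called.
*/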
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
......
......@@ -31,16 +31,14 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indices are kept in tensors */
/* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
/*
copy selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it (remove this???)
*/
//XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, int copyNum = 1);
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
......
......@@ -23,6 +23,7 @@
#include "../../XUtility.h"
#include "CopyValues.h"
#include "CopyValues.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -52,15 +52,15 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
{
ShowNTErrors("TODO!");
}
/* sparse -> dense */
else if (s->isSparse && !t->isSparse &&
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
{
ShowNTErrors("TODO!");
}
......
......@@ -33,28 +33,6 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 0,
we have 3 sub-tensors of size (2, 4)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
......@@ -101,15 +79,10 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int order = s.order;
int order = index.order + 1;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = index.unitNum;
else
dimSize[i] = s.dimSize[i];
}
memcpy(dimSize, index.dimSize, index.order * sizeof(int));
dimSize[index.order] = s.GetDim(-1);
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
......@@ -122,20 +95,7 @@ XTensor Gather(XTensor &s, XTensor &index)
/* tensor connection */
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = t.GetDim(-1);
XTensor tt;
tt = Reshape(t, index.order + 1, dims);
delete[] dims;
return tt;
}
else {
return t;
}
return t;
}
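/*
illustrative shape example (not part of the original source): with the dimension logic
above, gathering from a 2-D table s of size (V, d) with an index tensor of size (m, n)
yields t of size (m, n, d), i.e. the index shape followed by the last dimension of s.
*/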
} // namespace nts(NiuTrans.Tensor)
......@@ -27,9 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure)
......
......@@ -219,7 +219,6 @@ void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
}
}
}
/*
......@@ -236,15 +235,18 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
CheckNTErrors(collection->unitNum/collection->GetDim(-1) == index->unitNum,
"Illegal dimension!");
for(int i = 0; i < order; i++){
if(i == dim){
CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
}
else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
}
//for(int i = 0; i < order; i++){
// if(i == dim){
// CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
// }
// else {
// CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
// }
//}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
......
......@@ -137,6 +137,115 @@ XTensor Concatenate(const TensorList &smalls, int dim)
}
}
bool CheckConcatenateShape(const TensorList &smalls, int dim, XTensor &big, bool uniform)
{
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
}
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
}
for (int i = 0; i < order; i++) {
if (dimSize[i] != big.dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true; /* big already has the expected concatenated shape */
}
void Concatenate(const TensorList & smalls, XTensor & big, int dim)
{
CheckNTErrors(smalls.count > 0, "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
uniform = false;
}
if (!big.isInit || !CheckConcatenateShape(smalls, dim, big, uniform)) {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
/* destroy variables */
delete[] dimSize;
}
if (uniform) {
/* call _Merge function */
_Merge(&smalls, &big, dim);
/* tensor connection */
if (big.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
}
}
else {
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
if (big.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
}
}
}
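/*
illustrative example (not part of the original source): if smalls holds three tensors,
each of size (2, 4), then Concatenate(smalls, big, 0) initializes big as (6, 4) and,
because the inputs share one shape, routes the copy through _Merge; with differently
sized inputs along dim it falls back to _ConcatenateSolely.
*/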
/*
concatenate two tensors along a given dimension
......
......@@ -41,6 +41,8 @@ Note that this is actually a wrapper that selects
*/
XTensor Concatenate(const TensorList &smalls, int dim);
void Concatenate(const TensorList & smalls, XTensor & big, int dim);
/* concatenate two tensors along a given dimension */
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim);
......
......@@ -273,16 +273,16 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim)
merge small tensors into a big tensor
>> smalls - the list of the small tensors
>> big - the merged tensor (for return)
>> t - the merged tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
*/
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
{
whereToMerge = (whereToMerge < 0 ? big->order - 1 : whereToMerge);
whereToMerge = (whereToMerge < 0 ? t->order - 1 : whereToMerge);
CheckNTErrors((smalls != NULL), "Invalid list!");
CheckNTErrors((smalls->count > 0), "Empty list!");
CheckNTErrors((whereToMerge >= 0 && whereToMerge < big->order), "Wrong range of whereToMerge");
CheckNTErrors((whereToMerge >= 0 && whereToMerge < t->order), "Wrong range of whereToMerge");
bool uniform = true;
......@@ -292,7 +292,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
for (int i = 0; i < smalls->count; i++) {
XTensor* smallsItem = smalls->GetItem(i);
CheckNTErrors((big->unitNum == smallsItem->unitNum * mergeNum), "Unmatched tensors!");
CheckNTErrors((t->unitNum == smallsItem->unitNum * mergeNum), "Unmatched tensors!");
if (i > 0) {
XTensor * preItem = smalls->GetItem(i - 1);
......@@ -325,17 +325,17 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
/* merging with fewer data copy operations */
if (mergedNum * gridNum <= MIN_TENSOR_MERGE_LIST_NUM) {
int sPitch = blockSize * s0->unitSize;
int tPtich = blockSize * mergedNum * big->unitSize;
int mSize = blockSize * big->unitSize;
int tPtich = blockSize * mergedNum * t->unitSize;
int mSize = blockSize * t->unitSize;
int n = blockNum;
int sStep = 0;
int tStep = blockSize * big->unitSize;
int tStep = blockSize * t->unitSize;
for (int g = 0; g < gridNum; g++) {
char * tData = (char*)big->data + g * blockSize * blockNum * big->unitSize;
char * tData = (char*)t->data + g * blockSize * blockNum * t->unitSize;
for (int k = 0; k < mergedNum; k++) {
XTensor * s = smalls->GetItem(k);
char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
XMemCopy2D(tData + k * tStep, tPtich, big->devID,
XMemCopy2D(tData + k * tStep, tPtich, t->devID,
sData + k * sStep, sPitch, s->devID,
mSize, n);
}
......@@ -358,7 +358,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
tensorTMP->data = dataTMP;
......@@ -370,7 +370,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
}
}
_Merge(tensorTMP, big, whereToMerge + 1);
_Merge(tensorTMP, t, whereToMerge + 1);
delete[] dimSizeTMP;
......@@ -380,7 +380,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
if ((!uniform) && (mem != NULL))
mem->ReleaseBuf(mem->devID, size);
else
XMemFree(big->devID, dataTMP);
XMemFree(t->devID, dataTMP);
}
}
......
......@@ -36,7 +36,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge);
void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge);
/* merge small tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const TensorList &smalls, int whereToMerge);
......
......@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
>> s - the source tensor
>> t - the target tensor (for return)
......@@ -61,7 +61,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
/* for the case that we split the last dimension. Actually
(N, M) and (N, M/3, 3) have the same memory layout */
(N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return;
......@@ -184,7 +184,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
}
/*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure)
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
......
......@@ -27,6 +27,7 @@
#include "../XTensor.h"
#include "Dropout.h"
#include "DropoutWithIndex.h"
#include "HardTanH.h"
#include "Identity.h"
#include "LogSoftmax.h"
......
......@@ -16,14 +16,13 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include <stdlib.h>
#include "../XName.h"
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -37,27 +36,27 @@ y = 1 if x > 1
*/
void _HardTanH(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaHardTanH(x, y);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
int n = x->GetSize();
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p > 1.0)
p = 1.0;
else if(p < -1.0)
p = -1.0;
op[i] = p;
}
int n = x->GetSize();
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p > 1.0)
p = 1.0;
else if(p < -1.0)
p = -1.0;
op[i] = p;
}
else
ShowNTErrors("TODO!");
}
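/*
illustrative values (not part of the original source): the clipping above maps
1.7 -> 1.0, 0.3 -> 0.3 and -2.5 -> -1.0, i.e. every entry is clamped to [-1, 1].
*/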
/*
......@@ -111,50 +110,36 @@ hard tanh: y = 1 if x > 1
and dy/dx = 1 if -1 <= x <= 1
0 otherwise
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the hardtanh function
>> x - input of the hardtanh function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _HardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
CheckNTErrors(x != NULL, "The input tensor x must not be NULL!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaHardTanHBackward(gold, y, x, dedy, dedx, lossName);
if(x->devID >= 0){
_CudaHardTanHBackward(y, x, dedy, dedx);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE s =ip[i];
if(s > 1.0 || s < -1.0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = x->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE s = ip[i];
if(s > 1.0 || s < -1.0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
else
ShowNTErrors("TODO!");
}
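/*
illustrative values (not part of the original source): the backward pass above realizes
dE/dx = dE/dy * dy/dx with dy/dx = 1 inside [-1, 1] and 0 outside, so a gradient at
x = 0.3 passes through unchanged while a gradient at x = 1.7 is zeroed.
*/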
} // namespace nts(NiuTrans.Tensor)
......@@ -21,8 +21,6 @@
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "Loss.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -63,25 +61,19 @@ y = 1 if x > 1
*/
void _CudaHardTanH(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse,
"The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else{
ShowNTErrors("TODO!");
}
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -92,13 +84,12 @@ dy/dx = 1 if -1 <= x <= 1
>> dedy - dE/dy
>> dedx - dE/dx
>> gold - gold standard
>> y - y of the function
>> x - x of the function
>> size - size of y/x
*/
__global__
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * x, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -123,44 +114,29 @@ hard tanh: y = 1 if x > 1
and dy/dx = 1 if -1 <= x <= 1
0 otherwise
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the hardtanh function
>> x - input of the hardtanh function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _CudaHardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
int gridSize[3], blockSize[3];
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int gridSize[3], blockSize[3];
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
/* dE/dx = dE/dy * dy/dx */
KernelHardtanhBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)x->data,
x->unitNum);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelHardtanhBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
......
......@@ -23,7 +23,6 @@
#define __HARDTANH_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -38,9 +37,8 @@ y = 1 if x > 1
void _CudaHardTanH(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _CudaHardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
#endif // USE_CUDA
......
......@@ -23,7 +23,6 @@
#define __HARDHANH_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -43,9 +42,8 @@ XTensor HardTanH(const XTensor &x);
void HardTanH(const XTensor &x, XTensor &y);
/* de/dx */
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _HardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#include "../XName.h"
#include "Identity.h"
#include "../loss/LHeader.h"
#include "../XName.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
......@@ -30,10 +29,12 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
identity function y = x
>> x - input tensor
>> y - result
>> y - output tensor
*/
void _Identity(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
_CopyValues(x, y);
}
......@@ -42,7 +43,7 @@ identity function y = x (return an XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
<< return - output tensor
*/
XTensor Identity(const XTensor &x)
{
......@@ -78,33 +79,16 @@ backward computation for identity function y = x
dE/dx = dE/dy * dy/dx = dE/dy
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the identity function
>> x - input of the identity function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _IdentityBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
if(dedy->data != dedx->data)
_CopyValues(dedy, dedx);
}
else
ShowNTErrors("TODO!");
if(dedy->data != dedx->data)
_CopyValues(dedy, dedx);
}
} // namespace nts(NiuTrans.Tensor)
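As a plain illustration of the simplified interface (not from this commit; the helper name is made up), the identity backward step is just a copy of dE/dy into dE/dx, skipped when the two buffers alias.

void IdentityBackwardSketch(const float * dedy, float * dedx, int size)
{
    if(dedx == dedy)    /* nothing to do when dE/dx shares storage with dE/dy */
        return;
    for(int i = 0; i < size; i++)
        dedx[i] = dedy[i];
}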
......@@ -23,7 +23,6 @@
#define __IDENTITY_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -36,9 +35,8 @@ XTensor Identity(const XTensor &x);
void Identity(const XTensor &x, XTensor &y);
/* de/dx */
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _IdentityBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -222,7 +222,6 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
}
}
/*
backward computation for dense matrices with default data type
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-26
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-26
*/
#ifndef __LOGSOFTMAX_CUH__
#define __LOGSOFTMAX_CUH__
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#ifndef __LOGSOFTMAX_H__
#define __LOGSOFTMAX_H__
......@@ -36,6 +36,9 @@ XTensor LogSoftmax(const XTensor &x, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
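The header only declares the interface; as a rough sketch (not part of this commit, hypothetical helper name), a numerically stable log-softmax over one row of plain floats subtracts the maximum before exponentiating so that exp() cannot overflow.

#include <cmath>

void LogSoftmaxSketch(const float * x, float * y, int size)
{
    float maxv = x[0];
    for(int i = 1; i < size; i++)
        if(x[i] > maxv)
            maxv = x[i];

    float sum = 0.0F;
    for(int i = 0; i < size; i++)
        sum += std::exp(x[i] - maxv);

    /* y_i = x_i - log(\sum_j e^{x_j}) */
    float logSum = maxv + std::log(sum);
    for(int i = 0; i < size; i++)
        y[i] = x[i] - logSum;
}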
......@@ -22,10 +22,9 @@
#include "Loss.h"
#include "Loss.cuh"
#include "../XDevice.h"
#include "../core/math/Power.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/math/Unary.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Binary.h"
#include "../core/arithmetic/Sum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/reduce/ReduceSum.h"
......
......@@ -16,23 +16,25 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XName.h"
#include "Rectify.h"
#include "Rectify.cuh"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
rectify function y = max(0, x)
>> input - input tensor
>> output - result
>> x - input tensor
>> y - output tensor
*/
void _Rectify(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaRectify(x, y);
......@@ -40,28 +42,24 @@ void _Rectify(const XTensor * x, XTensor * y)
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
int n = x->GetSize();
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p < 0)
p = 0;
op[i] = p;
}
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
int n = x->GetSize();
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p < 0)
p = 0;
op[i] = p;
}
else
ShowNTErrors("TODO!");
}
/*
rectify function y = max(0, x) (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - input tensor
<< return - y
>> x - input tensor
<< return - output tensor
*/
XTensor Rectify(const XTensor &x)
{
......@@ -107,50 +105,36 @@ rectified: y = 0 if x < 0
and dy/ds = 0 if x < 0
1 otherwise
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the rectify function
>> x - input of the rectify function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _RectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
CheckNTErrors(x != NULL, "The input tensor x must be not NULL!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaRectifyBackward(gold, y, x, dedy, dedx, lossName);
if(x->devID >= 0){
_CudaRectifyBackward(y, x, dedy, dedx);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = y->unitNum;
for(int i = 0; i < size; i++){
/* dE/ds = dE/dy * dy/ds = dE/dy */
DTYPE s = ip[i];
if(s < 0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = x->unitNum;
for(int i = 0; i < size; i++){
/* dE/ds = dE/dy * dy/ds = dE/dy */
DTYPE s = ip[i];
if(s < 0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
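For reference (illustration only; the helper name is hypothetical): with the loss handling moved out, the rectify backward step is a simple element-wise gate on the sign of x.

void RectifyBackwardSketch(const float * dedy, float * dedx,
                           const float * x, int size)
{
    /* dE/dx = dE/dy if x >= 0, and 0 otherwise */
    for(int i = 0; i < size; i++)
        dedx[i] = (x[i] < 0.0F) ? 0.0F : dedy[i];
}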
......@@ -16,13 +16,10 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "Rectify.h"
#include "Rectify.cuh"
#include "Loss.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -57,24 +54,17 @@ rectify function y = max(0, x)
*/
void _CudaRectify(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "The Rectify function does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "The input vectors must be of the same length.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelRectify<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
KernelRectify<<<dim3(gridSize[0]), dim3(blockSize[0])>>>
((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -85,13 +75,11 @@ dy/dx = 1 if x >= 0
>> dedy - dE/dy
>> dedx - dE/dx
>> gold - gold standard
>> y - output of the function
>> x - input of the function
>> size - size of output/input
*/
__global__
void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * x, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -104,11 +92,10 @@ void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y,
}
}
/*
backward computation (Cuda version)
dE/ds = dE/dy * dy/ds
dE/dx = dE/dy * dy/dx
rectify : y = s if s >= 0
0 if s < 0
......@@ -116,48 +103,29 @@ rectify : y = s if s >= 0
and dy/ds = 1 if s >= 0
0 otherwise
>> gold - gold standard to measure error (or loss)
>> output - output of the activation function, i.e., y
>> input - input of the activation function , i.e., s
>> dEdY - dE/dy
>> dEdS - dE/ds
>> lossName - type of loss function, e.g., cross entropy
>> gBeg - where to start in the gold standard (along the leading dimension)
>> gLen - segment length from gBeg (along the leading dimension)
>> oBeg - where to start in the model output (along the leading dimension)
>> parallelRunner - parallel processing module
>> y - output of the rectify function
>> x - input of the rectify function
>> dedy - dE/dy
>> dedx - dE/dx
*/
void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _CudaRectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelRectifyBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelRectifyBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
......
......@@ -23,7 +23,6 @@
#define __RECTIFY_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -33,9 +32,8 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaRectify(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _CudaRectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
#endif // USE_CUDA
......
......@@ -23,7 +23,6 @@
#define __RECTIFY_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -36,9 +35,8 @@ XTensor Rectify(const XTensor &x);
void Rectify(const XTensor &x, XTensor &y);
/* de/dx */
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _RectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -34,6 +34,9 @@ sigmoid function y = 1/(1+exp(-x))
*/
void _Sigmoid(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaSigmoid(x, y);
......@@ -59,7 +62,7 @@ sigmoid function y = 1/(1+exp(-x)) (return an XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
<< return - output tensor
*/
XTensor Sigmoid(const XTensor &x)
{
......@@ -97,50 +100,32 @@ dE/ds = dE/dy * dy/dx
sigmoid: y = 1/(1+exp(-x))
and dy/dx = y * (1 -y)
and dy/dx = y * (1 - y)
>> gold - gold standard to measure the error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _SigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaSigmoidBackward(gold, y, x, dedy, dedx, lossName);
if(x->devID >= 0){
_CudaSigmoidBackward(y, x, dedy, dedx);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * op = (DTYPE*)y->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE y = op[i];
dedxp[i] = dedyp[i] * (DTYPE)y * ((DTYPE)1.0 - y);
}
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * op = (DTYPE*)y->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE y = op[i];
dedxp[i] = dedyp[i] * (DTYPE)y * ((DTYPE)1.0 - y);
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
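For reference (illustration only, not from this commit): since dy/dx = y * (1 - y), the backward step only needs the forward output y, as in this minimal sketch with a hypothetical helper name.

void SigmoidBackwardSketch(const float * dedy, float * dedx,
                           const float * y, int size)
{
    /* dE/dx = dE/dy * y * (1 - y) */
    for(int i = 0; i < size; i++)
        dedx[i] = dedy[i] * y[i] * (1.0F - y[i]);
}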
......@@ -61,24 +61,19 @@ sigmoid function y = 1/(1+exp(-x)) (Cuda version)
*/
void _CudaSigmoid(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "the activation function (sigmoid) does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "we require two vectors of the same length.");
CheckNTErrors(!x->isSparse && !y->isSparse, "the activation function (sigmoid) does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "we require two vectors of the same length.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelSigmoidCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
KernelSigmoidCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -92,13 +87,12 @@ sigmoid: y = 1/(1+exp(-x))
>> dedy - dE/dy
>> dedx - dE/ds
>> gold - gold standard
>> y - output of the function
>> x - input of the function
>> size - size of output/input
*/
__global__
void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -116,46 +110,31 @@ sigmoid: y = 1/(1+exp(-x))
and dy/dx = y * (1 - y)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _CudaSigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelSigmoidBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(y->devID, y->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(y->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelSigmoidBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)y->data,
y->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -23,7 +23,6 @@
#define __SIGMOID_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -33,9 +32,8 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _CudaSigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
#endif // USE_CUDA
......
......@@ -16,14 +16,13 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#ifndef __SIGMOID_H__
#define __SIGMOID_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -36,9 +35,8 @@ XTensor Sigmoid(const XTensor &x);
void Sigmoid(const XTensor &x, XTensor &y);
/* de/dx */
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _SigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#include <math.h>
#include "Softmax.h"
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#ifndef __SOFTMAX_H__
#define __SOFTMAX_H__
......
......@@ -28,7 +28,6 @@
#include "../core/arithmetic/Multiply.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/arithmetic/Negate.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
......@@ -63,23 +62,6 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
/*XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);*/
XTensor * inter = NewTensor(output);
_Log(output, inter);
......@@ -94,7 +76,6 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
_MultiplyMe(loss, padding);
DelTensor(inter);
}
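The tensor routine above computes loss = -sum_i gold_i * log(output_i), optionally weighted and masked by padding. A minimal scalar sketch for a single row (illustration only; the helper name is hypothetical) would be:

#include <cmath>

float CrossEntropySketch(const float * output, const float * gold, int size)
{
    float loss = 0.0F;
    for(int i = 0; i < size; i++)
        loss -= gold[i] * std::log(output[i]);
    return loss;
}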
/*
......
......@@ -29,7 +29,6 @@
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
......
......@@ -21,6 +21,7 @@
#include <math.h>
#include "TCrossEntropy.h"
#include "../loss/CrossEntropy.h"
#include "../core/math/ScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,8 +22,6 @@
#ifndef __TEST_CROSSENTROPY_H__
#define __TEST_CROSSENTROPY_H__
#include "../loss/CrossEntropy.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for CrossEntropy Function */
......
......@@ -25,211 +25,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: gather indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2,
srcIndex = [0, 2], indexSize = 2
*/
bool TestGather1()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
{2.0F, 3.0F} },
{ {1.0F, 4.0F},
{3.0F, 2.0F}},
{ {-1.0F, 2.0F},
{1.0F, 0.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
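As a plain restatement of what this case checks (illustration only; the fixed sizes and helper name are just for this example), gathering along the last dimension with srcIndex = {0, 2} picks out those two slices:

void GatherLastDimSketch(const float src[3][2][3], float dst[3][2][2],
                         const int * srcIndex, int indexSize)
{
    /* copy the indexed columns of the last dimension into the result */
    for(int i = 0; i < 3; i++)
        for(int j = 0; j < 2; j++)
            for(int k = 0; k < indexSize; k++)
                dst[i][j][k] = src[i][j][srcIndex[k]];
}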
/*
case 2: gather indexed sub-tensors
In this case, (3, 2, 3) -> (3, 1, 3), dim = 1,
srcIndex = [0], indexSize = 1
*/
bool TestGather2()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 1, 3) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 1;
tDimSize[2] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][1][3] = { { {0.0F, -1.0F, 2.0F} },
{ {1.0F, 2.0F, 4.0F} } ,
{ {-1.0F, 3.0F, 2.0F} } };
int dim = 1;
int indexSize = 1;
int srcIndex[2] = {0};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool TestGather3()
bool TestGather1()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
......@@ -286,7 +85,7 @@ bool TestGather3()
index->SetData(srcIndex, indexSize);
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
_Gather(s, t, index);
tUser = Gather(*s, *index);
/* check results */
......@@ -309,7 +108,7 @@ bool TestGather3()
indexGPU->SetData(srcIndex, indexSize);
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
_Gather(sGPU, tGPU, indexGPU);
tUserGPU = Gather(*sGPU, *indexGPU);
/* check results */
......@@ -360,24 +159,6 @@ bool TestGather()
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestGather2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 2 test */
caseFlag = TestGather3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
......
......@@ -129,14 +129,12 @@ bool TestHardTanH2()
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE yAnswer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE dedyAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, 0.0F, -0.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -144,27 +142,24 @@ bool TestHardTanH2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
gold->SetData(goldData, unitNum);
y->SetZeroAll();
dedy->SetZeroAll();
dedx->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(x, y);
/* call HardTanHBackward function */
_HardTanHBackward(gold, y, x, dedy, dedx, SQUAREDERROR);
_HardTanHBackward(y, x, dedy, dedx);
/* check results */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -173,37 +168,32 @@ bool TestHardTanH2()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
goldGPU->SetData(goldData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(xGPU, yGPU);
/* call hardtanhbackward function */
_HardTanHBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, SQUAREDERROR);
_HardTanHBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check results */
gpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete gold;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete goldGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
......@@ -213,7 +203,6 @@ bool TestHardTanH2()
/* destroy variables */
delete x;
delete y;
delete gold;
delete dedx;
delete dedy;
delete[] dimSize;
......
......@@ -120,9 +120,8 @@ bool TestIdentity2()
unitNum *= dimSize[i];
DTYPE xData[3] = {1.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.0F, 0.0F, 1.0F};
DTYPE yAnswer[3] = {1.0F, 1.0F, 2.0F};
DTYPE dedyAnswer[3] = {0.0F, 0.0F, -0.5F};
DTYPE dedyData[3] = {0.0F, 0.0F, -0.5F};
DTYPE dedxAnswer[3] = {0.0F, 0.0F, -0.5F};
/* CPU test */
......@@ -131,27 +130,24 @@ bool TestIdentity2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * g = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
g->SetData(gData, unitNum);
y->SetZeroAll();
dedx->SetZeroAll();
dedy->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call Identity function */
_Identity(x, y);
/* call IdentityBackward function */
_IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
_IdentityBackward(y, x, dedy, dedx);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -160,37 +156,32 @@ bool TestIdentity2()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call Identity function */
_Identity(xGPU, yGPU);
/* call IdentityBackward function */
_IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
_IdentityBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
......@@ -200,7 +191,6 @@ bool TestIdentity2()
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] dimSize;
......
......@@ -22,7 +22,7 @@
#ifndef __TEST_NEGATE_H__
#define __TEST_NEGATE_H__
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,6 +19,7 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
*/
#include "../core/math/Binary.h"
#include "../XUtility.h"
#include "TPower.h"
......
......@@ -22,8 +22,6 @@
#ifndef __TEST_POWER_H__
#define __TEST_POWER_H__
#include "../core/math/Power.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Power Function */
......
......@@ -119,16 +119,14 @@ bool TestRectify2()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE yAnswer[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE dedyAnswer[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE xData[2][3] = { {-1.0F, 1.0F, 2.0F},
{-2.0F, 4.0F, 5.0F} };
DTYPE yData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.0F, 4.0F, 5.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE dedxAnswer[2][3] = { {0.0F, -0.5F, -0.25F},
{0.0F, -0.125F, -0.1F} };
/* CPU test */
bool cpuTest = true;
......@@ -136,27 +134,22 @@ bool TestRectify2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
gold->SetData(goldData, unitNum);
y->SetZeroAll();
dedy->SetZeroAll();
y->SetData(yData, unitNum);
dedy->SetData(dedyData, unitNum);
dedx->SetZeroAll();
/* call Rectify function */
_Rectify(x, y);
/* call RectifyBackward function */
_RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);
_RectifyBackward(y, x, dedy, dedx);
/* check results */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -165,39 +158,32 @@ bool TestRectify2()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
goldGPU->SetData(goldData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
yGPU->SetData(yData, unitNum);
dedyGPU->SetData(dedyData, unitNum);
dedxGPU->SetZeroAll();
/* call Rectify function */
_Rectify(xGPU, yGPU);
/* call rectifybackward function */
_RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
_RectifyBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check results */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete dedy;
delete dedx;
delete gold;
delete xGPU;
delete yGPU;
delete dedyGPU;
delete dedxGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
......@@ -207,8 +193,7 @@ bool TestRectify2()
delete y;
delete dedy;
delete dedx;
delete gold;
delete[] dimSize;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
......
......@@ -59,7 +59,8 @@ bool TestSigmoid1()
yUser = Sigmoid(*x);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) &&
yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -79,7 +80,8 @@ bool TestSigmoid1()
yUserGPU = Sigmoid(*xGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) &&
yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......@@ -104,7 +106,7 @@ case 2: test Sigmoid function and SigmoidBackward function.
sigmoid function: y = 1/(1+exp(-x))
backward computation:
dE/ds = dE/dy * dy/dx
dy/dx = y * (1 -y)
dy/dx = y * (1 - y)
In this case, LossName=CROSSENTROPY.
*/
bool TestSigmoid2()
......@@ -119,10 +121,9 @@ bool TestSigmoid2()
unitNum *= dimSize[i];
DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
DTYPE yAnswer[3] = {0.5F, 0.7311F, 0.8808F};
DTYPE dedyAnswer[3] = {-0.8F, -1.0943F, -1.1353F};
DTYPE dedxAnswer[3] = {-0.2F, -0.2151F, -0.1192F};
DTYPE dedyData[3] = {0.0F, 1.0F, 2.0F};
DTYPE dedxAnswer[3] = {0.0F, 0.1966F, 0.2100F};
/* CPU test */
bool cpuTest = true;
......@@ -130,65 +131,58 @@ bool TestSigmoid2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * g = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
g->SetData(gData, unitNum);
y->SetZeroAll();
dedy->SetZeroAll();
dedx->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call Sigmoid function */
_Sigmoid(x, y);
/* call SigmoidBackward function */
_SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY);
_SigmoidBackward(y, x, dedy, dedx);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call Sigmoid function */
_Sigmoid(xGPU, yGPU);
/* call SigmoidBackward function */
_SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
_SigmoidBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
......@@ -198,7 +192,6 @@ bool TestSigmoid2()
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] dimSize;
......
......@@ -22,7 +22,7 @@
#ifndef __TEST_SIGN_H__
#define __TEST_SIGN_H__
#include "../core/arithmetic/Sign.h"
#include "../core/math/Unary.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -192,22 +192,22 @@ bool TestSpread2()
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
tIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForCopyIndexed(s1, t, dim, sIndex, tIndex, 1);
_SpreadForGather(s2, t, sIndex);
/* check results */
cpuTest = s1->CheckData(answer, tUnitNum) &&
s2->CheckData(answer, tUnitNum);
cpuTest = s1->CheckData(answer, sUnitNum) &&
s2->CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -218,34 +218,34 @@ bool TestSpread2()
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
tIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndex, cIndex, 1);
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, tIndexGPU, 1);
_SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
sGPU2->CheckData(answer, tUnitNum);
gpuTest = sGPU1->CheckData(answer, sUnitNum) &&
sGPU2->CheckData(answer, sUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sIndex;
delete cIndex;
delete tIndex;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete sIndexGPU;
delete cIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -257,7 +257,142 @@ bool TestSpread2()
delete s2;
delete t;
delete sIndex;
delete cIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test _SpreadForGather and _SpreadForCopyIndexed function
spread a collection tensor to source tensor
*/
bool TestSpread3()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, 0.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{2.0F, 2.0F, 4.0F} };
DTYPE tData[3][2] = { {0.0F, -1.0F},
{2.0F, 1.0F},
{2.0F, 0.0F} };
DTYPE answer[3][3] = { {-1.0F, 0.0F, 2.0F},
{3.0F, 1.0F, 5.0F},
{2.0F, 2.0F, 6.0F} };
int dim = 1;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {1, 0};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexSize);
tIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(s1, t, dim, sIndex, tIndex, 1);
_SpreadForCopyIndexed(s2, t, dim, sIndex, tIndex, 1);
/* check results */
cpuTest = s1->CheckData(answer, sUnitNum) &&
s2->CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexSize);
tIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, tIndexGPU, 1);
_SpreadForCopyIndexed(sGPU2, tGPU, dim, sIndexGPU, tIndexGPU, 1);
/* check results */
gpuTest = sGPU1->CheckData(answer, sUnitNum) &&
sGPU2->CheckData(answer, sUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sIndex;
delete tIndex;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -285,6 +420,24 @@ bool TestSpread()
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 1 test */
caseFlag = TestSpread2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 1 test */
caseFlag = TestSpread3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test SumByColumnTV function
sum of a tensor and a vector (column vector) in a column by column manner
*/
bool TestSumByColumnTV1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 1) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 1;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0F},
{0.0F} };
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(cOrder, cDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
_SumByColumnTV(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SumByColumnTV function */
_SumByColumnTV(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
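For reference, the case above checks c[i][j] = a[i][j] + b[i][0] element by element: row 0 of a is shifted by b[0][0] = 1, giving {1.0F, 2.0F, 3.0F, 4.0F}, and row 1 is shifted by b[1][0] = 0, so it stays {4.0F, 5.0F, 6.0F, 7.0F}.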
/* other cases */
/*
TODO!!
*/
/* test for SumByColumnTV Function */
bool TestSumByColumnTV()
{
XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSumByColumnTV1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SUMBYCOLUMNTV_H__
#define __TEST_SUMBYCOLUMNTV_H__
#include "../core/arithmetic/SumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SumByColumnTV Function */
extern "C"
bool TestSumByColumnTV();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUMBYCOLUMNTV_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test SumByColumnVT function
sum of a vector (column vector) and a tensor in a column by column manner
*/
bool TestSumByColumnVT1()
{
/* a tensor of size (2, 1) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 1;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 4) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (2, 1) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 1;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][1] = { {1.0F},
{0.0F} };
DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0F},
{22.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(cOrder, cDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SumByColumnVT function */
_SumByColumnVT(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SumByColumnVT function */
_SumByColumnVT(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SumByColumnVT Function */
bool TestSumByColumnVT()
{
XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSumByColumnVT1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SUMBYCOLUMNVT_H__
#define __TEST_SUMBYCOLUMNVT_H__
#include "../core/arithmetic/SumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SumByColumnVT Function */
extern "C"
bool TestSumByColumnVT();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUMBYCOLUMNVT_H__
......@@ -68,10 +68,9 @@ bool Test()
wrong = !TestSin() || wrong;
wrong = !TestSort() || wrong;
wrong = !TestSplit() || wrong;
wrong = !TestSpread() || wrong;
wrong = !TestSub() || wrong;
wrong = !TestSum() || wrong;
wrong = !TestSumByColumnTV() || wrong;
wrong = !TestSumByColumnVT() || wrong;
wrong = !TestSumDim() || wrong;
wrong = !TestTan() || wrong;
wrong = !TestTranspose() || wrong;
......
......@@ -61,10 +61,9 @@
#include "TSin.h"
#include "TSort.h"
#include "TSplit.h"
#include "TSpread.h"
#include "TSub.h"
#include "TSum.h"
#include "TSumByColumnTV.h"
#include "TSumByColumnVT.h"
#include "TSumDim.h"
#include "TTan.h"
#include "TTranspose.h"
......