Commit 854a4835 by xuchen

1. Fixed a bug in the Spread function. 2. Added the Compare and OnehotAndIndex functions. 3. Optimized the code.
parent 61c4d15c
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "XNoder.h" #include "XNoder.h"
#include "XBackwardShape.h" #include "XBackwardShape.h"
#include "../tensor/XName.h" #include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h" #include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h" #include "../tensor/core/getandset/SetData.h"
...@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent) ...@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
if(operID == MOVEMENT_COPYINDEXED) if(operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficent); GradCopyIndexed(node, isEfficent);
if(operID == MOVEMENT_GATHER) else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent); GradGather(node, isEfficent);
else if(operID == SHAPE_MERGE) else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent); GradMerge(node, isEfficent);
...@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors ...@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors
for for
b = copyindexed(a) b = copyindexed(a)
we have we have
dE/da = spread(b) dE/da = spreadforcopyindexed(b)
>> node - the node (c) for backward computation >> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
...@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent) ...@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int * srcIndex = (int *)income.GetParamPointer(1); int copyNum = income.GetParamInt(1);
int indexSize = income.GetParamInt(2);
int * tgtIndex = (int *)income.GetParamPointer(3);
int copyNum = income.GetParamInt(4);
int realIndexSize = indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
int * realTgtIndex = new int[realIndexSize];
for(int i = 0; i < indexSize; i++) {
for(int j = 0; j < copyNum; j++) {
realSrcIndex[i * copyNum + j] = srcIndex[i] + j;
realTgtIndex[i * copyNum + j] = tgtIndex[i] + j;
}
}
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XNoder::MakeGrad(input); XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2];
_Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
delete[] realSrcIndex; XNoder::MakeGrad(input);
delete[] realTgtIndex; _SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
delete[] srcIndex;
delete[] tgtIndex;
node->visitMark = NODE_FINISHED;
} }
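Note on the rewritten backward rule: spreading for copy-indexed is a scatter-add of the output gradient back into the source gradient at the copied positions. A minimal standalone sketch over flat float arrays, assuming the target sub-tensors are laid out consecutively (the names below are illustrative, not the XTensor API):

#include <cstdio>
#include <vector>

// Forward (conceptually): b[i*copyNum + j] = a[srcIndex[i] + j] for j in [0, copyNum).
// Backward: dE/da[srcIndex[i] + j] += dE/db[i*copyNum + j], i.e. a scatter-add.
// Each "sub-tensor" is a single float here to keep the sketch one-dimensional.
void SpreadForCopyIndexedSketch(std::vector<float> &gradA,
                                const std::vector<float> &gradB,
                                const std::vector<int> &srcIndex,
                                int copyNum)
{
    for (size_t i = 0; i < srcIndex.size(); i++)
        for (int j = 0; j < copyNum; j++)
            gradA[srcIndex[i] + j] += gradB[i * copyNum + j];
}

int main()
{
    std::vector<float> gradA(6, 0.0F);
    std::vector<float> gradB = {1, 2, 3, 4};   // gradient w.r.t. b = copyindexed(a)
    std::vector<int> srcIndex = {1, 4};        // with copyNum = 2: copies a[1],a[2] and a[4],a[5]
    SpreadForCopyIndexedSketch(gradA, gradB, srcIndex, 2);
    for (float g : gradA)
        printf("%g ", g);                      // 0 1 2 0 3 4
    printf("\n");
    return 0;
}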
/* /*
...@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent) ...@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
/* /*
gradient for merge gradient for merge
for for
...@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent) ...@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input); XNoder::MakeGrad(input);
int * dims = new int[input->order]; int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){ for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){ if(i >= leadDim){
dims[j++] = input->dimSize[i]; dims[j++] = input->dimSize[i];
......
...@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model) ...@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
} }
for(int i = 0; i < argc; i++){ for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool")) if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID); model.mem = new XMem(model.devID);
} }
} }
...@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise. ...@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id >> devID - device id
>> mem - memory pool >> mem - memory pool
*/ */
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, int itemNum, int devID, XMem * mem) void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{ {
if(devID >= 0 || (mem != NULL && mem->devID >= 0)) InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, -1);
else
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
tensor.SetZeroAll(); tensor.SetZeroAll();
/* set none-zero cells */ /* set none-zero cells */
for(int i = 0; i < itemNum; i++) for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]); tensor.Set2D(1.0F, rows[i], cols[i]);
if(devID >= 0 || (mem != NULL && mem->devID >= 0)){
XList list(1);
list.Add(&tensor);
CPUToGPUFlush(&list, devID, mem);
}
} }
/* /*
...@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = softmax(s) */ /* y = softmax(s) */
_LogSoftmax(&s, &y, 1); _LogSoftmax(&s, &y, 1);
} }
} }
/* /*
...@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model ...@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor embeddingBig; XTensor embeddingBig;
XTensor hidden; XTensor hidden;
XTensor b; XTensor b;
XTensor srcIndex;
int size = batch * (n-1); int size = batch * (n-1);
int * index = new int[size]; int * index = new int[size];
...@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model ...@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
} }
} }
InitTensor1D(&srcIndex, size, X_INT, model.devID, model.mem); InitTensor1D(&words, size, X_INT, model.devID, model.mem);
srcIndex.SetData(index, size); words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
XTensor embedding;
embedding = Gather(model.embeddingW, srcIndex);
delete[] index; delete[] index;
int dimSize[2]; int dimSize[2];
dimSize[0] = embedding.GetDim(0) / (n - 1); dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embedding.GetDim(1) * (n - 1); dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embedding, embedding.order, dimSize); hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */ /* hidden layers */
for(int i = 0; i < depth; i++) for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]; hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
} }
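The reshape above packs the n-1 gathered context embeddings of each example into one row before the hidden layers (which now also apply HardTanH). A small sketch of the dimension bookkeeping, with assumed toy sizes:

#include <cassert>
#include <cstdio>

int main()
{
    // Assumed toy sizes, not values from the commit.
    int batch = 4, n = 3, eSize = 8;

    // Gather returns one embedding row per context word: (batch*(n-1), eSize).
    int rows = batch * (n - 1), cols = eSize;

    // Reshape so that each example owns one row holding its n-1 embeddings side by side.
    int dimSize[2];
    dimSize[0] = rows / (n - 1);     // = batch
    dimSize[1] = cols * (n - 1);     // = (n-1) * eSize

    assert(dimSize[0] * dimSize[1] == rows * cols);   // the element count is unchanged
    printf("(%d, %d) -> (%d, %d)\n", rows, cols, dimSize[0], dimSize[1]);
    return 0;
}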
/* /*
...@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model) ...@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */ /* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1); output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
} }
/* /*
......
...@@ -70,7 +70,7 @@ make the decoding network ...@@ -70,7 +70,7 @@ make the decoding network
>> inputDec - the input tensor of the decoder >> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder >> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid >> mask - mask that indicates which position is valid
>> mask - mask for the encoder-decoder attention >> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder << return - the output tensor of the encoder
*/ */
......
...@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input) ...@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
} }
/* then we make word embeddings */ /* then we make word embeddings */
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input); wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize)); wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
...@@ -693,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file, ...@@ -693,70 +693,46 @@ int T2TTrainer::LoadBatchLM(FILE * file,
dims[2] = vs; dims[2] = vs;
InitTensor2D(batchEnc, sc, max, X_INT, devID, mem); InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
//InitTensor(batchEnc, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem); InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem); InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
batchEnc->SetZeroAll(); batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
gold->SetZeroAll(); gold->SetZeroAll();
paddingEnc->SetZeroAll();
paddingDec->SetZeroAll(); paddingDec->SetZeroAll();
if(isTraining) {
//XNoder::MakeGrad(batchEnc);
XNoder::MakeGrad(paddingEnc);
XNoder::MakeGrad(gold);
XNoder::MakeGrad(paddingDec);
//batchEnc->grad->SetZeroAll();
paddingEnc->grad->SetZeroAll();
gold->grad->SetZeroAll();
paddingDec->grad->SetZeroAll();
}
int seqSize = 0; int seqSize = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum]; int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum]; MTYPE * goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum]; //MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
//MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0; int wGold = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc); memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
for(int s = seq; s < seq + sc; s++){ for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1; int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!"); CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
//batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w); batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w); //paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
batchEncValues[wCount] = num; //paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
//paddingEnc->Set2D(1.0F, s - seq, w);
//paddingDec->Set2D(1.0F, s - seq, w);
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0) if (w > 0)
//gold->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
if (w == len - 1) { if (w == len - 1) {
if (isDoubledEnd) if (isDoubledEnd)
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
else else
//gold->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
} }
wCount++; wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL) if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w]; seqs[seqSize++] = buf[seqOffset[s] + w];
} }
...@@ -767,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file, ...@@ -767,16 +743,25 @@ int T2TTrainer::LoadBatchLM(FILE * file,
} }
} }
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
gold->SetDataBatched(goldOffsets, 1.0F, wGold); gold->SetDataBatched(goldOffsets, 1.0F, wGold);
//paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
delete[] batchEncOffsets;
delete[] batchEncValues; delete[] batchEncValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
delete[] goldOffsets; delete[] goldOffsets;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
fflush(tf); fflush(tf);
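The padding tensors are now derived from the token ids themselves: converting batchEnc to float and applying NotEqual against 0 sets every non-zero cell to 1 and every padded (still zero) cell to 0, so the per-cell offset arrays are no longer needed. A standalone sketch of the same idea, assuming id 0 appears only in the zero-initialized padding cells:

#include <cstdio>
#include <vector>

// padding[i] = (tokens[i] != 0) ? 1 : 0, the effect of ConvertDataType followed by NotEqual(., 0).
std::vector<float> PaddingFromTokens(const std::vector<int> &tokens)
{
    std::vector<float> padding(tokens.size());
    for (size_t i = 0; i < tokens.size(); i++)
        padding[i] = (tokens[i] != 0) ? 1.0F : 0.0F;
    return padding;
}

int main()
{
    std::vector<int> batchEnc = {5, 12, 7, 0, 0, 0};   // one sequence, padded to length 6
    std::vector<float> paddingEnc = PaddingFromTokens(batchEnc);
    for (float p : paddingEnc)
        printf("%.0f ", p);                            // 1 1 1 0 0 0
    printf("\n");
    return 0;
}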
...@@ -877,33 +862,33 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -877,33 +862,33 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int wGold = 0; int wGold = 0;
wCount = 0; wCount = 0;
MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
int * batchEncValues = new int[batchEnc->unitNum]; int * batchEncValues = new int[batchEnc->unitNum];
MTYPE * batchDecOffsets = new MTYPE[batchDec->unitNum];
int * batchDecValues = new int[batchDec->unitNum]; int * batchDecValues = new int[batchDec->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2]; //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2]; //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2]; MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
/* batch of the source-side sequences */ /* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){ for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s]; int len = seqLen[s];
int sent = (s - seq)/2; int sent = (s - seq)/2;
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
batchEncOffsets[wCount] = batchEnc->GetOffset2D(sent, w); //paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
batchEncValues[wCount] = num; wCountEnc++;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(sent, w);
wCount++;
} }
} }
batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount); //paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
wCountEnc = wCount; _ConvertDataType(batchEnc, tmp);
wCount = 0; _NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
/* batch of the target-side sequences */ /* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){ for(int s = seq + 1; s < seq + sc; s += 2){
...@@ -912,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -912,10 +897,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int sent = (s - seq - 1)/2; int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){ for(int w = 0; w < len; w++){
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
batchDecOffsets[wCountDec] = batchDec->GetOffset2D(sent, w); //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
batchDecValues[wCountDec] = num;
paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w > 0) if (w > 0)
goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]); goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
...@@ -938,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file, ...@@ -938,16 +921,20 @@ int T2TTrainer::LoadBatchMT(FILE * file,
} }
} }
batchDec->SetDataBatched(batchDecOffsets, batchDecValues, wCountDec); batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec); //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchDec, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
gold->SetDataBatched(goldOffsets, 1.0F, wGold); gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncOffsets;
delete[] batchEncValues; delete[] batchEncValues;
delete[] batchDecOffsets;
delete[] batchDecValues; delete[] batchDecValues;
delete[] paddingEncOffsets; //delete[] paddingEncOffsets;
delete[] paddingDecOffsets; //delete[] paddingDecOffsets;
delete[] goldOffsets; delete[] goldOffsets;
return sc; return sc;
...@@ -981,11 +968,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs) ...@@ -981,11 +968,12 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
XTensor probs; XTensor probs;
InitTensor(&probs, output); InitTensor(&probs, output);
/*XTensor logOutput; //XTensor logOutput;
InitTensor(&logOutput, output); //InitTensor(&logOutput, output);
_Log(output, &logOutput);*/ //_Log(output, &logOutput);
/* probs[i,j] = output[i,j] * gold[i,j] */ /* probs[i,j] = output[i,j] * gold[i,j] */
//_Multiply(&logOutput, gold, &probs);
_Multiply(output, gold, &probs); _Multiply(output, gold, &probs);
/* probability of each word */ /* probability of each word */
...@@ -1161,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi ...@@ -1161,6 +1149,7 @@ void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * paddi
_ExpMe(output); _ExpMe(output);
_ScaleAndShiftMe(output, 1/count); _ScaleAndShiftMe(output, 1/count);
_LogMe(output); _LogMe(output);
_ScaleAndShiftMe(gold, 1/count); _ScaleAndShiftMe(gold, 1/count);
} }
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include <math.h> #include <math.h>
#include <time.h>
#include "Transformer.h" #include "Transformer.h"
#include "T2TModel.h" #include "T2TModel.h"
#include "T2TUtility.h" #include "T2TUtility.h"
...@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv) ...@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, ""); LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argc, args); trainer.Init(argc, args);
......
...@@ -65,10 +65,10 @@ namespace nts { ...@@ -65,10 +65,10 @@ namespace nts {
#endif #endif
#ifndef MIN #ifndef MIN
#define MIN(a,b) ((a < b) ? a : b) #define MIN(a,b) ((a) < (b) ? a : b)
#endif #endif
#ifndef MAX #ifndef MAX
#define MAX(a,b) ((a > b) ? a : b) #define MAX(a,b) ((a) > (b) ? a : b)
#endif #endif
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) #define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......
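The added parentheses matter once a macro argument is itself an expression built from operators that bind more loosely than '<'. A small self-contained example (the macros are renamed so they do not clash with the real ones):

#include <cstdio>

#define MIN_OLD(a,b) ((a < b) ? a : b)
#define MIN_NEW(a,b) ((a) < (b) ? a : b)

int main()
{
    int x = 1;
    // MIN_OLD(x & 3, 2) expands to ((x & 3 < 2) ? x & 3 : 2); since '<' binds tighter
    // than '&', the condition becomes x & (3 < 2) == 0, and the macro wrongly returns 2.
    int oldMin = MIN_OLD(x & 3, 2);
    // MIN_NEW(x & 3, 2) expands to ((x & 3) < (2) ? x & 3 : 2) and correctly returns 1.
    int newMin = MIN_NEW(x & 3, 2);
    printf("%d %d\n", oldMin, newMin);   // prints "2 1"
    return 0;
}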
...@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_COPYVALUES + 1 #define SHAPE MOVEMENT_GATHER + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
......
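The old definition reused MOVEMENT_COPYVALUES + 1 for SHAPE, giving it the same id as MOVEMENT_GATHER, so id comparisons (such as the MakeGrad dispatch earlier in this commit) could not tell the two operators apart. A sketch with an assumed base value (the real values live in XName.h):

#include <cstdio>

#define MOVEMENT_COPYINDEXED 100                      /* assumed base value */
#define MOVEMENT_COPYVALUES  MOVEMENT_COPYINDEXED + 1 /* 101 */
#define MOVEMENT_GATHER      MOVEMENT_COPYVALUES + 1  /* 102 */

#define SHAPE_OLD            MOVEMENT_COPYVALUES + 1  /* 102, collides with MOVEMENT_GATHER */
#define SHAPE_NEW            MOVEMENT_GATHER + 1      /* 103, unique again */

int main()
{
    printf("GATHER=%d SHAPE_OLD=%d SHAPE_NEW=%d\n",
           MOVEMENT_GATHER, SHAPE_OLD, SHAPE_NEW);    // GATHER=102 SHAPE_OLD=102 SHAPE_NEW=103
    return 0;
}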
...@@ -804,7 +804,7 @@ set tensor items with an array of values ...@@ -804,7 +804,7 @@ set tensor items with an array of values
>> values - value for each data item >> values - value for each data item
>> num - number of the data items >> num - number of the data items
*/ */
void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num) void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
{ {
_SetDataWithOffsetAndValue(this, offsets, values, num); _SetDataWithOffsetAndValue(this, offsets, values, num);
} }
...@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize() ...@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize()
if(dataType == DEFAULT_DTYPE){ if(dataType == DEFAULT_DTYPE){
int count = 0; int count = 0;
for(int i = 0; i < unitNum; i++){ for(int i = 0; i < unitNum; i++){
DTYPE value = *((DTYPE*)(char*)data + i * sizeof(DTYPE)); DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0) if(value == 0)
count++; count++;
} }
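The corrected expression offsets the raw pointer in bytes before reinterpreting it as DTYPE*; the old form first cast to DTYPE* and then advanced by i*sizeof(DTYPE) elements, overshooting by a factor of sizeof(DTYPE). A small illustration in plain C++:

#include <cstdio>

int main()
{
    float data[4] = {1.0F, 0.0F, 3.0F, 0.0F};
    void * p = data;
    int i = 1;

    // Fixed form: advance by i*sizeof(float) bytes, then reinterpret, yielding data[1].
    float ok = *(float*)((char*)p + i * sizeof(float));

    // Old form: ((float*)(char*)p + i * sizeof(float)) advances by i*sizeof(float) elements,
    // i.e. i*sizeof(float)*sizeof(float) bytes, which reads past the array here.
    // float bad = *((float*)(char*)p + i * sizeof(float));   // undefined behaviour, left commented out

    printf("%g\n", ok);   // 0
    return 0;
}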
...@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData) ...@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
CheckNTErrors((a != NULL), "Empty input!"); CheckNTErrors((a != NULL), "Empty input!");
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0) if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order); memcpy(dims, a->dimSize, sizeof(int) * a->order);
......
...@@ -285,7 +285,7 @@ public: ...@@ -285,7 +285,7 @@ public:
void SetDataBatched(MTYPE * offsets, DTYPE value, int num); void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */ /* set tensor items with an array of values */
void SetDataBatched(MTYPE * offsets, void * values, int num); void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */ /* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0); bool CheckData(const void * answer, int num, int beg = 0);
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
/* this is a header to include all functions in the "core" workspace */ /* this is a header to include all functions in the "core" workspace */
...@@ -46,16 +46,17 @@ ...@@ -46,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h" #include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h" #include "getandset/Select.h"
#include "getandset/SetData.h" #include "getandset/SetData.h"
#include "math/Clip.h" #include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h" #include "math/Normalize.h"
#include "math/Power.h" #include "math/Power.h"
#include "math/ScaleAndShift.h" #include "math/ScaleAndShift.h"
#include "math/Unary.h" #include "math/Unary.h"
#include "movement/CopyBlocks.h" #include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h" #include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h" #include "movement/CopyBlocksOnSite.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
convert onehot tensor to index tensor
>> onehot - onehot tensor, which value is 0 or 1
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
}
indexData[i] = record;
}
}
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
*/
XTensor OnehotToIndex(XTensor & onehot, int size)
{
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
index.SetTMPFlag();
_OnehotToIndex(&onehot, &index, size);
return index;
}
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
onehot->SetZeroAll();
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}
}
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
<< return - the onehot tensor
*/
XTensor IndexToOnehot(XTensor & index, int size)
{
CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor onehot;
onehot.SetTMPFlag();
int order = index.order;
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size);
delete[] dim;
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
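A standalone sketch of the two conversions on flat row-major arrays, assuming exactly one non-zero entry per one-hot row (names are illustrative, not the library's API):

#include <cstdio>
#include <vector>

// index[i] = position of the single non-zero entry in row i of the one-hot matrix.
std::vector<int> OnehotToIndexSketch(const std::vector<int> &onehot, int size)
{
    std::vector<int> index(onehot.size() / size, -1);
    for (size_t i = 0; i < index.size(); i++)
        for (int j = 0; j < size; j++)
            if (onehot[i * size + j] != 0)
                index[i] = j;                      // assumes one non-zero per row
    return index;
}

// onehot[i][index[i]] = 1, every other cell stays 0.
std::vector<int> IndexToOnehotSketch(const std::vector<int> &index, int size)
{
    std::vector<int> onehot(index.size() * size, 0);
    for (size_t i = 0; i < index.size(); i++)
        onehot[i * size + index[i]] = 1;
    return onehot;
}

int main()
{
    std::vector<int> index = {2, 0, 3};
    std::vector<int> onehot = IndexToOnehotSketch(index, 4);
    std::vector<int> back = OnehotToIndexSketch(onehot, 4);
    for (int v : back)
        printf("%d ", v);                          // 2 0 3
    printf("\n");
    return 0;
}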
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include "OnehotAndIndex.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
convert onehot tensor to index tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of block
>> stride - stride of a data block
*/
__global__
void KernelOnehotToIndex(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int * id = indexData + i;
if (od[offset] != 0)
*id = offset;
}
/*
convert onehot tensor to index tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelOnehotToIndex<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
/*
convert index tensor to onehot tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of block
>> stride - stride of a data block
*/
__global__
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int id = indexData[i];
od[id] = 1;
}
/*
convert index tensor to onehot tensor (cuda version)
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_CUH__
#define __ONEHOTANDINDEX_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_H__
#define __ONEHOTANDINDEX_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_H__
\ No newline at end of file
...@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize; fanOut = numOutputFmaps * receptiveFieldSize;
} }
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut)); DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout); DTYPE a = (DTYPE)sqrt(3.0F) * std;
tensor->SetDataRand(-a, a);
//_SetDataRand(tensor, -finfout, finfout); //_SetDataRand(tensor, -finfout, finfout);
} }
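The new initializer is the Xavier/Glorot uniform scheme: a standard deviation of gain * sqrt(2 / (fanIn + fanOut)) and a uniform range [-a, a] with a = sqrt(3) * std, which for gain = 1 equals the previous sqrt(6 / (fanIn + fanOut)) bound. A sketch with assumed fan sizes:

#include <cmath>
#include <cstdio>
#include <random>

int main()
{
    // Assumed toy fan sizes.
    float gain = 1.0F, fanIn = 512.0F, fanOut = 512.0F;

    float stddev = gain * std::sqrt(2.0F / (fanIn + fanOut));
    float a = std::sqrt(3.0F) * stddev;                 // equals gain * sqrt(6/(fanIn+fanOut))

    std::mt19937 rng(42);
    std::uniform_real_distribution<float> dist(-a, a);  // uniform on [-a, a] has variance stddev^2
    printf("bound a = %g, sample = %g\n", a, dist(rng));
    return 0;
}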
...@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values ...@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values
} }
else { else {
#ifdef USE_CUDA #ifdef USE_CUDA
XMem * mem = tensor->mem; if(tensor->devID >= 0) {
MTYPE offsetSize = num * sizeof(MTYPE); _CudaSetDataWithOffsetAndValue(tensor, offsets, values, num);
MTYPE valueSize; return;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
_CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, offsetsCuda);
XMemFree(tensor->devID, valuesCuda);
} }
#else #else
ShowNTErrors("Please recompile the code with USE_CUDA"); ShowNTErrors("Please recompile the code with USE_CUDA");
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "SetData.cuh" #include "SetData.cuh"
#include <curand_kernel.h> #include <curand_kernel.h>
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor, ...@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor,
2 2 0 2 2 0
*/ */
__global__ __global__
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift) void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{ {
/* offset in each block */ /* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift) ...@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift); KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
...@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version) ...@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version)
>> num - number of the data items >> num - number of the data items
*/ */
__global__ __global__
void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num) void KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
{ {
/* index */ /* index */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < num) if (i < num)
data[offsets[i]] = value; data[offsets[i]] = value;
} }
...@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP ...@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num); KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
...@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version) ...@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version)
>> dataType - the data type of the data and values >> dataType - the data type of the data and values
*/ */
__global__ __global__
void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType) void KernelSetDataWithOffsetAndValue(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
{ {
/* index */ /* index */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -541,6 +542,18 @@ set the data with an array of values ...@@ -541,6 +542,18 @@ set the data with an array of values
*/ */
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num) void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{ {
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
...@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va ...@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType); MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, mem->devID, values, -1, valueSize);
}
else {
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
}
KernelSetDataWithOffsetAndValue<<<blocks, threads >>> (tensor->data, offsetsCuda, valuesCuda, num, tensor->dataType);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, valuesCuda);
XMemFree(tensor->devID, offsetsCuda);
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
......
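The host-to-device staging of the offset and value arrays now happens inside the CUDA routine, so callers can hand over plain host arrays. A standalone CUDA sketch of the same pattern using the plain runtime API instead of the library's memory pool (treating the offset type as a 64-bit integer is an assumption here):

#include <cstdio>
#include <cuda_runtime.h>

// Scatter: data[offsets[i]] = values[i], the core of the set-data-with-offset-and-value kernel.
__global__ void KernelScatter(float * data, const long long * offsets,
                              const float * values, int num)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < num)
        data[offsets[i]] = values[i];
}

int main()
{
    const int size = 8, num = 3;
    long long hOffsets[num] = {1, 4, 6};
    float hValues[num] = {1.0F, 2.0F, 3.0F};

    float * dData; long long * dOffsets; float * dValues;
    cudaMalloc(&dData, size * sizeof(float));
    cudaMalloc(&dOffsets, num * sizeof(long long));
    cudaMalloc(&dValues, num * sizeof(float));
    cudaMemset(dData, 0, size * sizeof(float));

    // Stage the host-side index/value arrays on the device before launching,
    // mirroring what the CUDA version of the function now does internally.
    cudaMemcpy(dOffsets, hOffsets, num * sizeof(long long), cudaMemcpyHostToDevice);
    cudaMemcpy(dValues, hValues, num * sizeof(float), cudaMemcpyHostToDevice);

    KernelScatter<<<1, 32>>>(dData, dOffsets, dValues, num);

    float hData[size];
    cudaMemcpy(hData, dData, size * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("%g ", hData[i]);                   // 0 1 0 0 2 0 3 0
    printf("\n");

    cudaFree(dData); cudaFree(dOffsets); cudaFree(dValues);
    return 0;
}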
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
DTYPE myIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
DTYPE myIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#ifdef USE_CUDA
/* define three marco separately, specify the respective function names (GPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#else
/* define three marco separately, specify the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
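The Equal/NotEqual pair above is generated by a small family of macros: one produces the out-of-place elementwise comparison, one the in-place variant, and one the tensor-returning wrapper. A minimal sketch of the same code-generation pattern over std::vector (all names are illustrative):

#include <cstdio>
#include <vector>

static float myIsNotEqualSketch(float a, float b) { return a != b ? 1.0F : 0.0F; }

// Generates the out-of-place comparison b[i] = origFunc(a[i], number).
#define SIMPLE_COMPARE_SKETCH(_funcName, origFunc) \
void _funcName(const std::vector<float> &a, std::vector<float> &b, float number) \
{ \
    b.resize(a.size()); \
    for (size_t i = 0; i < a.size(); i++) \
        b[i] = origFunc(a[i], number); \
}

// Generates the in-place variant, which simply calls the out-of-place one on itself.
#define SIMPLE_COMPARE_SKETCH_ME(_funcNameMe, _funcName) \
void _funcNameMe(std::vector<float> &a, float number) \
{ \
    _funcName(a, a, number); \
}

SIMPLE_COMPARE_SKETCH(NotEqualSketch, myIsNotEqualSketch)
SIMPLE_COMPARE_SKETCH_ME(NotEqualMeSketch, NotEqualSketch)

int main()
{
    std::vector<float> a = {0, 3, 0, 7}, b;
    NotEqualSketch(a, b, 0.0F);       // out-of-place: b becomes the mask
    NotEqualMeSketch(a, 0.0F);        // in-place: a is overwritten with the mask
    for (float v : b)
        printf("%.0f ", v);           // 0 1 0 1, the padding-mask trick used in LoadBatchLM
    printf("\n");
    return 0;
}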
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
DTYPE cudaIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
__device__
DTYPE cudaIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#define SIMPLE_COMPARE_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size, DTYPE number) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (DTYPE)origFunc(a[i], number); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size, __half number) \
{ \
return; \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, \
a->unitNum, (DTYPE)number); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, \
a->unitNum, (__half)number); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
SIMPLE_COMPARE_FUNCTION_GPU(Equal, cudaIsEqual)
SIMPLE_COMPARE_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_CUH__
#define __COMPARE_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif //end __COMPARE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_H__
#define __COMPARE_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compare whether every entry is equal to the specified value */
void _Equal(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _EqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Equal(const XTensor & a, DTYPE number);
/* compare whether every entry is not equal to the specified value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _NotEqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is not equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor NotEqual(const XTensor & a, DTYPE number);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
...@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round) ...@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/ SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#endif #endif
} } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file \ No newline at end of file
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/ */
...@@ -145,5 +144,6 @@ void _TanMe(XTensor * a); ...@@ -145,5 +144,6 @@ void _TanMe(XTensor * a);
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
} } // namespace nts(NiuTrans.Tensor)
#endif //end __UNARY_H__ #endif // end __UNARY_H__
\ No newline at end of file \ No newline at end of file
...@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i ...@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
ProtectCudaDev(devID, devIDBackup); ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */ /* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); int * sourceBlocksTMP = myMem != NULL ?
int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int)); (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int)); XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "CopyIndexed.cuh"
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "Gather.h"
#include "../../XName.h" #include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -40,7 +42,9 @@ copy indexed sub-tensors ...@@ -40,7 +42,9 @@ copy indexed sub-tensors
e.g., for srcIndex = [1,4] and copyNum = 2, e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5 we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i ...@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
} }
/* /*
copy indexed sub-tensors (return an XTensor structure) copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((srcIndex && tgtIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == tgtIndex->devID || (srcIndex->devID < 0 && tgtIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && t->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == tgtIndex->unitNum), "Unmatched index tensors!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
}
else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
}
}
#ifdef USE_CUDA
if (s->devID >= 0 && srcIndex->devID >= 0) {
_CudaCopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
return;
}
#endif
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int*)srcIndex->data;
int * tIndex = (int*)tgtIndex->data;
for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = tIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = tData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(td + k) = *(sd + k);
}
}
}
}
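A quick sanity check on the blocking arithmetic above, using the (3, 2, 4), dim = 2 example from the comment: blockNum = 3 * 2 = 6 (the dimensions before dim), stride = 1 (no dimensions after dim) and blockSizeSrc = stride * 4 = 4. Each of the 6 blocks is therefore a contiguous run of 4 elements along dim 2, and source sub-tensor si contributes the element at offset si * stride inside every block, 6 elements in total, which is exactly the (3, 2) sub-tensor described in the comment.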
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum)
{
CheckNTErrors(dim >= 0 && dim < s.order, "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
int indexSize = srcIndex.unitNum;
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3);
list.Add(&s);
list.Add(&srcIndex);
list.Add(&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
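A minimal usage sketch of the tensor-index interface above, modeled on the test code later in this commit (assuming the usual XTensor.h / CopyIndexed.h includes are in place); the sizes, data and index values here are illustrative, not taken from the source:
void CopyIndexedSketch()
{
    /* a (4, 3) source; copy source sub-tensors 0 and 2 along dim 0
       into target positions 0 and 1, so the result is of size (2, 3) */
    int sDimSize[2] = {4, 3};
    int indexDimSize[1] = {2};
    int srcIdx[2] = {0, 2};
    int tgtIdx[2] = {0, 1};
    DTYPE sData[4][3] = { {0.0F, 1.0F, 2.0F},
                          {3.0F, 4.0F, 5.0F},
                          {6.0F, 7.0F, 8.0F},
                          {9.0F, 10.0F, 11.0F} };

    XTensor * s = NewTensor(2, sDimSize);
    XTensor * srcIndex = NewTensor(1, indexDimSize, X_INT);
    XTensor * tgtIndex = NewTensor(1, indexDimSize, X_INT);

    s->SetData(sData, 12);
    srcIndex->SetData(srcIdx, 2);
    tgtIndex->SetData(tgtIdx, 2);

    /* tUser holds rows 0 and 2 of s */
    XTensor tUser;
    tUser = CopyIndexed(*s, 0, *srcIndex, *tgtIndex, 1);
}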
/*
copy indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - the source tensor >> s - the source tensor
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
*/
#include "CopyIndexed.cuh"
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
copy selected sub-tensors where indices are kept in tensors (kernel version)
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
__global__
void KernelCopyIndexed(DTYPE * sData, DTYPE * tData, int * sIndex, int * tIndex,
int blockNum, int blockSizeSrc, int blockSizeTgt,
int stride, int indexSize, int copyNum)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum * indexSize * copyNum || offset >= stride)
return;
int realIndexSize = indexSize * copyNum;
int realBlockNum = i / realIndexSize;
int realIndex = i % realIndexSize;
int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
int realTgtIndex = tIndex[realIndex / copyNum] + realIndex % copyNum;
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
tp[threadIdx.x] = tData + realBlockNum * blockSizeTgt + realTgtIndex * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
t[offset] = s[offset];
}
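To make the flattened index arithmetic above concrete, take the comment's srcIndex = [1, 4], copyNum = 2 case with blockNum = 1 and an assumed tgtIndex = [0, 2]: realIndexSize = 4, and the thread with i = 3 gets realBlockNum = 0 and realIndex = 3, hence realSrcIndex = sIndex[1] + 1 = 5 and realTgtIndex = tIndex[1] + 1 = 3; that is, the fourth copied sub-tensor is source sub-tensor 5, matching the "1, 2, 4, 5" enumeration in the comment.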
/*
copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data;
int * tIndex = (int *)tgtIndex->data;
KernelCopyIndexed<<<blocks, threads >>>(sData, tData, sIndex, tIndex,
blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
BacktoCudaDev(devID, devIDBackup);
}
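In the launch above, the x dimension of the grid enumerates the blockNum * indexSize * copyNum sub-tensor copies and the y dimension enumerates the stride offsets inside one sub-tensor, which is why the kernel returns early when i or offset exceeds those bounds.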
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
* Tomorrow is the celebration of the laboratory, I'm so happy!
*/
#ifndef __COPYINDEXED_CUH__
#define __COPYINDEXED_CUH__
#include "../../XTensor.h"
#include "CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy selected sub-tensors where indices are kept in tensors (cuda version) */
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYINDEXED_CUH__
\ No newline at end of file
...@@ -27,22 +27,27 @@ ...@@ -27,22 +27,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indices are kept in tensors */ /* copy selected sub-tensors where indices are kept in tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex); void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
/* /*
copy selected sub-tensors (return an XTensor structure) copy selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???) make a new tensor to keep the result and return it (remove this???)
*/ */
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* /*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure) copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???) make a new tensor to keep the result and return it
*/ */
void CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex); XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum = 1);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -57,15 +57,14 @@ gather indexed sub-tensors ...@@ -57,15 +57,14 @@ gather indexed sub-tensors
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex) void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID && t->devID == srcIndex->devID), CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
"the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0 && srcIndex->devID >= 0) { if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex); _CudaGather(s, t, srcIndex);
return; return;
} }
...@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index)
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem); XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag(); t.SetTMPFlag();
delete[] dimSize;
_Gather(&s, &t, &index); _Gather(&s, &t, &index);
/* tensor connection */ /* tensor connection */
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/ */
#include "Gather.cuh" #include "Gather.cuh"
#include "CopyBlocksSelected.cuh" #include "CopyBlocksSelected.cuh"
...@@ -41,7 +41,7 @@ __global__ ...@@ -41,7 +41,7 @@ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride) void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
{ {
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */ /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int ...@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
if(threadIdx.y == 0){ if(threadIdx.y == 0){
sp[threadIdx.x] = sData + sIndex[i] * stride; sp[threadIdx.x] = sData + sIndex[i] * stride;
cp[threadIdx.x] = tData + i * stride; tp[threadIdx.x] = tData + i * stride;
} }
__syncthreads(); __syncthreads();
DTYPE * s = sp[threadIdx.x]; DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x]; DTYPE * t = tp[threadIdx.x];
c[offset] = s[offset]; t[offset] = s[offset];
} }
/* /*
...@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version) ...@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version)
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex) void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{ {
int devID = s->devID; int devID = s->devID;
XMem * mem = s->mem;
int stride = s->GetDim(1); int stride = s->GetDim(1);
int indexSize = srcIndex->unitNum; int indexSize = srcIndex->unitNum;
...@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
DTYPE * sData = (DTYPE*)s->data; DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data; DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data; int * sIndex = NULL;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride); KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else
XMemFree(devID, sIndex);
}
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/ */
#ifndef __GATHER_CUH__ #ifndef __GATHER_CUH__
#define __GATHER_CUH__ #define __GATHER_CUH__
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* gather indexed sub-tensors(cuda version) */ /* gather indexed sub-tensors(cuda version) */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex); void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize); void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */ /* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex); void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure) /* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
......
...@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum, ...@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum,
/* /*
spread a collection tensor to source tensor. spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function. And this is a special spread function for backward computation of CopyIndexed function.
>> source - the source tensor whose data would be modified >> s - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor >> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2, e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2) we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors >> srcIndex - the tensor to save the index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex) >> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim, void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize) XTensor * srcIndex, XTensor * collIndex,
int copyNum)
{ {
int order = source->order; int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && c), "Invalid tensors!");
CheckNTErrors((srcIndex && collIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == c->devID || (s->devID < 0 && c->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == collIndex->devID || (srcIndex->devID < 0 && collIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && c->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < s->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == c->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == collIndex->unitNum), "Unmatched index tensors!");
CheckNTErrors(s->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!"); CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for (int i = 0; i < order; i++){ for (int i = 0; i < order; i++) {
if (i == dim) { if (i != dim) {
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!"); CheckNTErrors(s->GetDim(i) == c->GetDim(i), "Unmatched dimensions");
} }
else { else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!"); CheckNTErrors(c->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
} }
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) { if(s->devID >= 0 && c->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize); _CudaSpreadForCopyIndexed(s, c, dim, srcIndex, collIndex, copyNum);
return; return;
} }
#endif #endif
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1; int blockNum = 1;
int stride = 1; int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) { for (int i = 0; i < dim; i++)
stride *= source->GetDim(i); blockNum *= s->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim); for (int i = dim + 1; i < order; i++)
blockSizeColl = stride * collection->GetDim(dim); stride *= s->GetDim(i);
blockNum = source->unitNum / blockSizeSrc;
DTYPE * sData = (DTYPE*)source->data; blockSizeSrc = stride * s->GetDim(dim);
DTYPE * cData = (DTYPE*)collection->data; blockSizeTgt = stride * c->GetDim(dim);
for(int i = 0; i < indexSize; i++){ DTYPE * sData = (DTYPE*)s->data;
int src = srcIndex[i]; DTYPE * cData = (DTYPE*)c->data;
int tgt = i; int * sIndex = (int*)srcIndex->data;
DTYPE * s = sData + src * stride; int * cIndex = (int*)collIndex->data;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride); for (int i = 0; i < indexSize; i++) {
for (int n = 0; n < copyNum; n++) {
int si = sIndex[i] + n;
int ti = cIndex[i] + n;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = cData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(sd + k) += *(td + k);
}
}
} }
} }
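For example, with the setting used by TestSpread2 later in this commit (dim = 0, srcIndex = {0, 2}, collIndex = {0, 1}, copyNum = 1), the accumulation loop above adds collection sub-tensor 0 into source sub-tensor 0 and collection sub-tensor 1 into source sub-tensor 2, i.e. exactly the reverse of the CopyIndexed forward mapping, which is what the gradient of a copy requires: the gradient of the source accumulates the gradient of the collection over every position that read it.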
/* /*
...@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0 && index->devID >= 0) { if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, index); _CudaSpreadForGather(source, collection, index);
return; return;
} }
......
...@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _CudaSpread(XTensor * source, XTensor * collection, int dim, void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex); int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */ /* special spread function for backward computation of CopyIndexed function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim, void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
int * srcIndex, int indexSize); XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function (cuda version) */ /* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex); void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
......
...@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection, ...@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection,
XTensor * srcIndex, XTensor * collIndex, XTensor * srcIndex, XTensor * collIndex,
int dim); int dim);
/* special spread function for backward computation of gather function */ /* special spread function for backward computation of CopyIndexed function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim, void _SpreadForCopyIndexed(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize); XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function */ /* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index); void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
......
...@@ -48,18 +48,18 @@ DTYPE _ReduceSumAll(const XTensor * source) ...@@ -48,18 +48,18 @@ DTYPE _ReduceSumAll(const XTensor * source)
{ {
int dims[2] = {1, source->unitNum}; int dims[2] = {1, source->unitNum};
int one = 1; int one = 1;
XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem); XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem); XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all); _CopyValues(source, all);
_ReduceSum(all, result, 1); _ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0); DTYPE r = result->Get1D(0);
DelTensorBuf(result); DelTensorBuf(result);
DelTensorBuf(all); DelTensorBuf(all);
return r; return r;
int order = source->order; int order = source->order;
...@@ -76,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source) ...@@ -76,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source)
dimSize = getDimSize(big, leadingDim); dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem); source->devID, source->mem);
_ReduceSum(big, little, leadingDim); _ReduceSum(big, little, leadingDim);
delete big; delete big;
......
...@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data; void * dataTMP = t->data;
if (!isOnSameDevice) if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum; int blockSplitSize = blockNum / splitNum;
......
...@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
int unitNum = x.dimSize[n]; int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum]; DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL)); //srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor); maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
...@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) ...@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
return MultiplyDim(x, mask, n, 0); return MultiplyDim(x, mask, n, 0);
} }
/*
dropout function without broadcast
>> x - input tensor
>> dropProb - probability to set an element to zero
*/
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
{
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return Multiply(x, mask);
}
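A minimal call sketch, assuming x is an existing DTYPE tensor and that RandomBernoulli(dropProb, scaleFactor) returns 0 with probability dropProb and scaleFactor otherwise, so surviving entries are already scaled by 1 / (1 - dropProb) and need no rescaling at inference time (the 0.1F probability is illustrative):
XTensor y = DropoutWithoutBroadcast(x, 0.1F);   /* roughly 10% of the entries zeroed, the rest scaled by 1/0.9 */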
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x, ...@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
/* dropout function */ /* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1); XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -373,9 +373,9 @@ better numerical stability. ...@@ -373,9 +373,9 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction) >> leadDim - leading dimension (along which we perform reduction)
*/ */
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim, XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName) LOSS_FUNCTION_NAME lossName)
{ {
leadDim = leadDim < 0 ? y->order - 1 : leadDim; leadDim = leadDim < 0 ? y->order - 1 : leadDim;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/ */
#ifndef __TEST_ABSOLUTE_H__ #ifndef __TEST_ABSOLUTE_H__
#define __TEST_ABSOLUTE_H__ #define __TEST_ABSOLUTE_H__
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
#include "../XTensor.h" #include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h" #include "TClip.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/ */
#ifndef __TEST_CLIP_H__ #ifndef __TEST_CLIP_H__
#define __TEST_CLIP_H__ #define __TEST_CLIP_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "../XTensor.h"
#include "../core/math/Compare.h"
#include "TCompare.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Equal function.
Compare whether every entry is equal to the specified value.
*/
bool TestCompare1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, 1.0F} };
DTYPE answer[3][2] = { {1.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(a, b, 1.0);
_EqualMe(aMe, 1.0);
bUser = Equal(*a, 1.0);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(aGPU, bGPU, 1.0);
_EqualMe(aMeGPU, 1.0);
bUserGPU = Equal(*aGPU, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Compare Function */
bool TestCompare()
{
XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCompare1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __TEST_COMPARE_H__
#define __TEST_COMPARE_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Compare Function */
bool TestCompare();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_COMPARE_H__
...@@ -232,13 +232,12 @@ bool TestConvertDataType3() ...@@ -232,13 +232,12 @@ bool TestConvertDataType3()
/* initialize variables */ /* initialize variables */
a->SetData(data1, unitNum1); a->SetData(data1, unitNum1);
/* call ConvertDataType function */ /* call ConvertDataType function (We have not implemented this yet...) */
//_ConvertDataType(a, b); //_ConvertDataType(a, b);
//_ConvertDataType(b, c); //_ConvertDataType(b, c);
/* check results */ /* check results */
cpuTest = a->CheckData(data1, unitNum1, 1e-4F); //cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
c->Dump(stderr, "");
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
......
...@@ -311,8 +311,8 @@ bool TestCrossEntropy3() ...@@ -311,8 +311,8 @@ bool TestCrossEntropy3()
delete goldGPU; delete goldGPU;
delete lossGPU; delete lossGPU;
delete weightGPU; delete weightGPU;
delete[] dimSize; delete[] dimSize;
delete[] wDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -322,6 +322,7 @@ bool TestCrossEntropy3() ...@@ -322,6 +322,7 @@ bool TestCrossEntropy3()
delete loss; delete loss;
delete weight; delete weight;
delete[] dimSize; delete[] dimSize;
delete[] wDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -212,6 +212,8 @@ bool TestDropout2() ...@@ -212,6 +212,8 @@ bool TestDropout2()
/* destroy variables */ /* destroy variables */
delete x; delete x;
delete y; delete y;
delete dedx;
delete dedy;
delete[] dimSize; delete[] dimSize;
return cpuTest; return cpuTest;
......
...@@ -332,6 +332,7 @@ bool TestGather3() ...@@ -332,6 +332,7 @@ bool TestGather3()
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t;
delete index;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
......
...@@ -150,6 +150,7 @@ bool TestSetData2() ...@@ -150,6 +150,7 @@ bool TestSetData2()
delete sGPU; delete sGPU;
delete modifyGPU; delete modifyGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -157,6 +158,7 @@ bool TestSetData2() ...@@ -157,6 +158,7 @@ bool TestSetData2()
delete s; delete s;
delete modify; delete modify;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
...@@ -242,6 +244,7 @@ bool TestSetData3() ...@@ -242,6 +244,7 @@ bool TestSetData3()
delete sGPU; delete sGPU;
delete modifyGPU; delete modifyGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
...@@ -249,6 +252,7 @@ bool TestSetData3() ...@@ -249,6 +252,7 @@ bool TestSetData3()
delete s; delete s;
delete modify; delete modify;
delete[] sDimSize; delete[] sDimSize;
delete[] dataDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -61,7 +61,9 @@ bool TestSort1() ...@@ -61,7 +61,9 @@ bool TestSort1()
_SortMe(aMe, index, 0); _SortMe(aMe, index, 0);
Sort(*a, bUser, *index, 0); Sort(*a, bUser, *index, 0);
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum); cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -85,7 +87,9 @@ bool TestSort1() ...@@ -85,7 +87,9 @@ bool TestSort1()
Sort(*aGPU, bUserGPU, *indexGPU, 0); Sort(*aGPU, bUserGPU, *indexGPU, 0);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum); gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
...@@ -149,7 +153,9 @@ bool TestSort2() ...@@ -149,7 +153,9 @@ bool TestSort2()
Sort(*a, bUser, *index, 1); Sort(*a, bUser, *index, 1);
/* check results */ /* check results */
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum); cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -173,7 +179,9 @@ bool TestSort2() ...@@ -173,7 +179,9 @@ bool TestSort2()
Sort(*aGPU, bUserGPU, *indexGPU, 1); Sort(*aGPU, bUserGPU, *indexGPU, 1);
/* check results */ /* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum); gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
......
...@@ -357,6 +357,7 @@ bool TestSplit3() ...@@ -357,6 +357,7 @@ bool TestSplit3()
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize1; delete[] tDimSize1;
delete[] tDimSize2; delete[] tDimSize2;
delete tList;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -182,6 +182,7 @@ bool TestSpread2() ...@@ -182,6 +182,7 @@ bool TestSpread2()
int dim = 0; int dim = 0;
int indexSize = 2; int indexSize = 2;
int srcIndex[2] = {0, 2}; int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -190,17 +191,19 @@ bool TestSpread2() ...@@ -190,17 +191,19 @@ bool TestSpread2()
XTensor * s1 = NewTensor(sOrder, sDimSize); XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize); XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize); XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT); XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */ /* initialize variables */
s1->SetData(sData, sUnitNum); s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum); s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum); t->SetData(tData, tUnitNum);
index->SetData(srcIndex, indexSize); sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */ /* call _SpreadForCopyIndexed and _SpreadForGather functions */
_SpreadForGather(s1, t, dim, srcIndex, indexSize); _SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForGather(s2, t, index); _SpreadForGather(s2, t, sIndex);
/* check results */ /* check results */
cpuTest = s1->CheckData(answer, tUnitNum) && cpuTest = s1->CheckData(answer, tUnitNum) &&
...@@ -214,17 +217,19 @@ bool TestSpread2() ...@@ -214,17 +217,19 @@ bool TestSpread2()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0); XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
sGPU1->SetData(sData, sUnitNum); sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum); sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum); tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize); sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */ /* call _SpreadForCopyIndexed and _SpreadForGather functions */
_SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize); _SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, cIndexGPU, 1);
_SpreadForGather(sGPU2, tGPU, indexGPU); _SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */ /* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) && gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
...@@ -234,11 +239,13 @@ bool TestSpread2() ...@@ -234,11 +239,13 @@ bool TestSpread2()
delete s1; delete s1;
delete s2; delete s2;
delete t; delete t;
delete index; delete sIndex;
delete cIndex;
delete sGPU1; delete sGPU1;
delete sGPU2; delete sGPU2;
delete tGPU; delete tGPU;
delete indexGPU; delete sIndexGPU;
delete cIndexGPU;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
...@@ -249,6 +256,8 @@ bool TestSpread2() ...@@ -249,6 +256,8 @@ bool TestSpread2()
delete s1; delete s1;
delete s2; delete s2;
delete t; delete t;
delete sIndex;
delete cIndex;
delete[] sDimSize; delete[] sDimSize;
delete[] tDimSize; delete[] tDimSize;
delete[] indexDimSize; delete[] indexDimSize;
......
...@@ -31,6 +31,7 @@ bool Test() ...@@ -31,6 +31,7 @@ bool Test()
wrong = !TestAbsolute() || wrong; wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong; wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong; wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong; wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong; wrong = !TestCos() || wrong;
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "TAbsolute.h" #include "TAbsolute.h"
#include "TClip.h" #include "TClip.h"
#include "TCompare.h"
#include "TConcatenate.h" #include "TConcatenate.h"
#include "TConcatenateSolely.h" #include "TConcatenateSolely.h"
#include "TCos.h" #include "TCos.h"
......