Commit 5f933fc6 by xuchen

cumulative update

parent 78954fad
......@@ -35,6 +35,8 @@
void BackwardTest();
void TransposeTest();
void SumDimTest();
void SplitBackwardTest();
void MemTest();
using namespace nts;
using namespace fnnlm;
......@@ -42,6 +44,10 @@ using namespace transformer;
int main( int argc, const char ** argv )
{
//MemTest();
//return 0;
//SplitBackwardTest();
//return 0;
//_CrtSetBreakAlloc(896);
//BackwardTest();
//return 0;
......@@ -89,7 +95,7 @@ void BackwardTest()
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
//XLink::ShowNetwork(stderr, &c);
net.Backward(c);
......@@ -208,4 +214,68 @@ void SumDimTest()
z.Dump(stderr, "z:");
delete[] data;
}
void SplitBackwardTest()
{
int * dimSize = new int[2];
dimSize[0] = 2;
dimSize[1] = 4;
XTensor t1;
InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
XTensor t2;
InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
XTensor tensor;
//_SetDataFixedFloat(&t1, 1.0F);
//_SetDataFixedFloat(&t2, 2.0F);
t1.SetDataRand();
t2.SetDataRand();
tensor = t1 + t2;
XList smalls;
XTensor first;
XTensor second;
InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
smalls.Add(&first);
smalls.Add(&second);
Split(tensor, smalls, 1, 2);
XTensor mul;
mul = Sum(first, second);
XNet net;
net.Backward(mul);
net.Dump(stderr);
printf("Done!");
}
void MemTest()
{
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
tensor.SetZeroAll();
tensor.Dump(stderr);
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
}
else {
printf("It's null\n");
}
tensor.Dump(stderr);
}
\ No newline at end of file
......@@ -25,6 +25,7 @@
#include "XNoder.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
......@@ -40,7 +41,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
if(operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficent);
if(operID == MOVEMENT_GATHER)
else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
......@@ -80,7 +81,7 @@ gradient computation for copying indexed sub-tensors
for
b = copyindexed(a)
we have
dE/da = spread(b)
dE/da = spreadforcopyindexed(b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
......@@ -91,32 +92,14 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
int dim = income.GetParamInt(0);
int * srcIndex = (int *)income.GetParamPointer(1);
int indexSize = income.GetParamInt(2);
int * tgtIndex = (int *)income.GetParamPointer(3);
int copyNum = income.GetParamInt(4);
int realIndexSize = indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
int * realTgtIndex = new int[realIndexSize];
for(int i = 0; i < indexSize; i++) {
for(int j = 0; j < copyNum; j++) {
realSrcIndex[i * copyNum + j] = srcIndex[i] + j;
realTgtIndex[i * copyNum + j] = tgtIndex[i] + j;
}
}
int copyNum = income.GetParamInt(1);
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
_Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2];
delete[] realSrcIndex;
delete[] realTgtIndex;
delete[] srcIndex;
delete[] tgtIndex;
node->visitMark = NODE_FINISHED;
XNoder::MakeGrad(input);
_SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
}
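For reference, a minimal sketch (in the style of SplitBackwardTest above) of how this new backward path might be exercised; the sizes, index values and the test-function name are made up for illustration, and CopyIndexed here is the tensor-index version added later in this commit:

void CopyIndexedBackwardTest()
{
    XTensor s;
    InitTensor2D(&s, 2, 4, X_FLOAT, 0, NULL);
    s.SetDataRand();

    /* pick source columns 0 and 2 and place them at target columns 0 and 1 */
    XTensor srcIndex;
    XTensor tgtIndex;
    InitTensor1D(&srcIndex, 2, X_INT, 0, NULL);
    InitTensor1D(&tgtIndex, 2, X_INT, 0, NULL);
    int sIdx[2] = {0, 2};
    int tIdx[2] = {0, 1};
    srcIndex.SetData(sIdx, 2);
    tgtIndex.SetData(tIdx, 2);

    XTensor t;
    t = CopyIndexed(s, 1, srcIndex, tgtIndex, 1);

    /* backward should reach GradCopyIndexed and spread dE/dt back
       into the two selected columns of s via _SpreadForCopyIndexed */
    XNet net;
    net.Backward(t);
    net.Dump(stderr);
}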
/*
......@@ -143,7 +126,6 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED;
}
/*
gradient for merge
for
......@@ -181,6 +163,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){
dims[j++] = input->dimSize[i];
......
......@@ -437,4 +437,25 @@ void XNet::ClearGrad(XTensor * node)
}
}
/*
show network topology
>> file - file to dump information
>> node - pointer to the node
*/
void XNet::ShowNetwork(FILE * file, XTensor * node)
{
XList roots(1);
roots.Add(node);
Traverse(roots);
XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * n = (XTensor*)nodes.Get(i);
XLink::ShowNode(file, n);
}
}
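With XLink::ShowNetwork removed later in this commit, call sites now go through this XNet member instead; a small sketch of the intended usage (tensor sizes are illustrative):

XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 2, X_FLOAT, -1, NULL);
InitTensor2D(&b, 2, 2, X_FLOAT, -1, NULL);
a.SetDataRand();
b.SetDataRand();
c = a + b;

/* dump every node of the graph rooted at c, in topological order */
XNet net;
net.ShowNetwork(stderr, &c);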
}
\ No newline at end of file
......@@ -108,6 +108,9 @@ struct XNet
/* clear the gradient information if the node is not used */
void ClearGrad(XTensor * node);
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
};
/* we make a unique id for every tensor */
......
......@@ -231,7 +231,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
}
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], "-mempool"))
if (!strcmp(argv[i], "-mempool"))
model.mem = new XMem(model.devID);
}
}
......@@ -715,24 +715,16 @@ The indexed cell is set to 1, and 0 otherwise.
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, int itemNum, int devID, XMem * mem)
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
if(devID >= 0 || (mem != NULL && mem->devID >= 0))
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, -1);
else
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
tensor.SetZeroAll();
/* set non-zero cells */
for(int i = 0; i < itemNum; i++)
tensor.Set2D(1.0F, rows[i], cols[i]);
if(devID >= 0 || (mem != NULL && mem->devID >= 0)){
XList list(1);
list.Add(&tensor);
CPUToGPUFlush(&list, devID, mem);
}
}
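A small usage sketch of the simplified initializer (the coordinates are made up): the call below builds a 2 x 3 matrix with 1.0F at cells (0, 1) and (1, 2) and 0.0F elsewhere, allocated directly on the requested device.

int rows[2] = {0, 1};
int cols[2] = {1, 2};

XTensor onehot;
InitZeroOneTensor2D(onehot, 2, 3, rows, cols, 2, -1, NULL);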
/*
......@@ -859,8 +851,6 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
}
}
/*
......@@ -998,7 +988,6 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor embeddingBig;
XTensor hidden;
XTensor b;
XTensor srcIndex;
int size = batch * (n-1);
int * index = new int[size];
......@@ -1010,28 +999,25 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
}
}
InitTensor1D(&srcIndex, size, X_INT, model.devID, model.mem);
srcIndex.SetData(index, size);
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
XTensor embedding;
embedding = Gather(model.embeddingW, srcIndex);
delete[] index;
int dimSize[2];
dimSize[0] = embedding.GetDim(0) / (n - 1);
dimSize[1] = embedding.GetDim(1) * (n - 1);
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
hidden = Reshape(embedding, embedding.order, dimSize);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
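To make the reshape concrete (the numbers are only for illustration): with batch = 2, n = 3 and eSize = embeddingBig.GetDim(1), the gathered tensor embeddingBig has shape (batch * (n - 1), eSize) = (4, eSize); dimSize then becomes {4 / (n - 1), eSize * (n - 1)} = {2, 2 * eSize}, so hidden holds one row per n-gram with the n - 1 context embeddings concatenated.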
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
}
/*
......@@ -1071,7 +1057,6 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
}
/*
......
......@@ -60,7 +60,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, false, myIgnored, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
......@@ -69,11 +69,12 @@ void AttDecoder::InitModel(int argc, char ** argv,
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicate each position is valid
>> mask - mask that indicates which positions are valid
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
......@@ -89,7 +90,6 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b
XTensor ln;
XTensor fnn;
XTensor res;
XTensor nothing;
/******************/
/* self attention */
......@@ -107,7 +107,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, b
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, nothing, isTraining);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......
......@@ -48,7 +48,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
};
}
......
......@@ -150,7 +150,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* then we make word embeddings */
//wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
......@@ -93,10 +93,11 @@ void AttEncoder::InitModel(int argc, char ** argv,
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used here
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
......@@ -144,4 +145,18 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
return x;
}
/*
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
......@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor &mask, bool isTraining) = 0;
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
};
/*
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
};
......@@ -118,6 +118,9 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
};
......
......@@ -75,7 +75,7 @@ void T2TModel::InitModel(int argc, char ** argv)
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
}
encoder->InitModel(argc, argv, isLM, 0, devID, mem);
encoder->InitModel(argc, argv, true, 0, devID, mem);
outputLayer->InitModel(argc, argv, devID, mem);
if(isMT)
......@@ -99,7 +99,9 @@ make the encoding network
*/
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
{
return encoder->Make(input, mask, isTraining);
XTensor nothing;
return encoder->Make(input, mask, nothing, isTraining);
}
/*
......@@ -107,13 +109,14 @@ make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> output - output tensor (distribution)
>> mask - the mask for positions that are/not involved in computation
>> mask - mask for positions that are/not involved in computation
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether we are training the model
<< return - decoding result
*/
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
return decoder->Make(inputDec, outputEnc, mask, isTraining);
return decoder->Make(inputDec, outputEnc, mask, maskEncDec, isTraining);
}
/*
......@@ -190,14 +193,16 @@ make the network for machine translation (with the output softmax layer)
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side)
>> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining)
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
/* generate mask to see "previous" words on the decoder side */
//int len = inputDec.GetDim(inputDec.order - 2);
......@@ -222,6 +227,23 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents attention to the padding (dummy words) */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
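A sketch of the effect, ignoring the leading copies added by the final _Unsqueeze: for a decoder position i and an encoder position j, maskEncDec[i][j] = 1e9 * paddingDec[i] * paddingEnc[j] - 1e9, which is 0 when both positions hold real tokens and -1e9 when either one is padding; added to the attention scores, this drives the attention weights for padded positions towards zero after the softmax.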
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
......@@ -252,7 +274,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output);
delete[] dims;
......
......@@ -72,13 +72,13 @@ public:
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, bool isTraining);
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* get parameter matrices */
void GetParams(XList &list);
......
......@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1);
}
}
......@@ -142,6 +142,9 @@ public:
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* indicates whether we use a small memory footprint for the backward process */
bool isSmallFootprint;
public:
/* constructor */
T2TTrainer();
......
......@@ -20,6 +20,7 @@
*/
#include <math.h>
#include <time.h>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
......@@ -58,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
......
......@@ -126,7 +126,7 @@ void SmallTest()
d = a + b + c.Lin(0.5F);
XLink::CheckNetwork(&d);
XLink::ShowNetwork(stderr, &d);
//XLink::ShowNetwork(stderr, &d);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
......
......@@ -65,10 +65,10 @@ namespace nts {
#endif
#ifndef MIN
#define MIN(a,b) ((a < b) ? a : b)
#define MIN(a,b) ((a) < (b) ? a : b)
#endif
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#define MAX(a,b) ((a) > (b) ? a : b)
#endif
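The added parentheses matter because macro arguments may themselves be expressions; a hypothetical expansion shows the difference:

/* old macro: MIN(x & 1, 2) expands to ((x & 1 < 2) ? x & 1 : 2);
   since '<' binds tighter than '&', the condition parses as x & (1 < 2),
   i.e. x & 1, so an even x wrongly selects 2 */

/* new macro: MIN(x & 1, 2) expands to ((x & 1) < (2) ? x & 1 : 2),
   which compares the intended operands; the selected results are still
   substituted unparenthesized, so arguments containing comma or
   assignment operators remain unsafe */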
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
......
......@@ -593,21 +593,6 @@ void XLink::CheckNetwork(XTensor * root)
}
/*
show the network encoded in a root node (tensor)
>> file - file to dump information
>> root - pointer to the root node
*/
void XLink::ShowNetwork(FILE * file, XTensor * root)
{
XLink &income = root->income;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
}
}
/*
show a node
>> file - file to dump information
>> root - pointer to the node
......
......@@ -178,10 +178,6 @@ struct XLink
static
void CheckNetwork(XTensor * root);
/* show the network encoded in a root node (tensor) */
static
void ShowNetwork(FILE * file, XTensor * root);
/* show a node */
static
void ShowNode(FILE * file, XTensor * node);
......
......@@ -79,7 +79,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_COPYVALUES + 1
#define SHAPE MOVEMENT_GATHER + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1
......
......@@ -677,9 +677,9 @@ void XTensor::SetData(const void * d, int num, int beg)
return;
CheckNTErrors(!isSparse, "TODO");
CheckNTErrors(num == unitNum - beg, "Illegal size!");
CheckNTErrors(num <= unitNum - beg, "Illegal size!");
XMemCopy(data, devID, d, -1, num * unitSize);
XMemCopy((char*)data + beg * unitSize, devID, d, -1, num * unitSize);
}
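With the relaxed check and the offset applied to the destination, SetData can now fill a sub-range of a tensor; a minimal sketch (sizes are illustrative):

XTensor t;
InitTensor2D(&t, 2, 4, X_FLOAT, -1, NULL);
t.SetZeroAll();

float row[4] = {1.0F, 2.0F, 3.0F, 4.0F};

/* copy 4 values starting at unit offset 4, i.e. overwrite only the second row */
t.SetData(row, 4, 4);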
/*
......@@ -804,7 +804,7 @@ set tensor items with an array of values
>> values - value for each data item
>> num - number of the data items
*/
void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num)
void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
{
_SetDataWithOffsetAndValue(this, offsets, values, num);
}
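The renamed batched setter addresses individual cells by their flat offsets; a sketch under the same assumptions:

XTensor t;
InitTensor2D(&t, 2, 4, X_FLOAT, -1, NULL);
t.SetZeroAll();

MTYPE offsets[2] = {0, 5};
float values[2] = {0.5F, 2.5F};

/* sets t(0, 0) = 0.5F and t(1, 1) = 2.5F (offset 5 = 1 * 4 + 1) */
t.SetDataBatchedWithValues(offsets, values, 2);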
......@@ -1289,7 +1289,7 @@ int XTensor::GetNonzeroSize()
if(dataType == DEFAULT_DTYPE){
int count = 0;
for(int i = 0; i < unitNum; i++){
DTYPE value = *((DTYPE*)(char*)data + i * sizeof(DTYPE));
DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0)
count++;
}
......@@ -2271,6 +2271,8 @@ XTensor * NewTensor(const XTensor * a, bool isFilledData)
CheckNTErrors((a != NULL), "Empty input!");
memset(dims, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order);
......
......@@ -49,6 +49,8 @@ struct XLink;
#define USE_BATCHED_STRIDED_MAT_MUL
#define MIN_TENSOR_SPLIT_NUM 0
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_MERGE_NUM 0
#define MIN_TENSOR_MERGE_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8
/* computation flags */
......@@ -283,7 +285,7 @@ public:
void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
/* set tensor items with an array of values */
void SetDataBatched(MTYPE * offsets, void * values, int num);
void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
/* this is a header to include all functions in the "core" workspace */
......@@ -46,16 +46,17 @@
#include "arithmetic/XTensorBLAS.h"
#include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h"
#include "getandset/SetData.h"
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
#include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h"
......
......@@ -32,8 +32,6 @@ convert data type
*/
void _ConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......@@ -61,4 +59,29 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
ShowNTErrors("Unsupported data types for conversion!");
}
/*
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - the input tensor
>> dataType - the target data type
<< return - the output tensor in the target data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
/* call _ConvertDataType function */
_ConvertDataType(&input, &output);

return output;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
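A minimal usage sketch of the new functional form, assuming the float-to-int path is supported by _ConvertDataType (the tensor and its size are made up):

XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT, -1, NULL);
a.SetDataRand();

/* b holds a's values converted to X_INT */
XTensor b;
b = ConvertDataType(a, X_INT);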
......@@ -23,12 +23,16 @@
#define __CONVERTDATATYPE_H__
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
convert onehot tensor to index tensor
>> onehot - onehot tensor, whose values are 0 or 1
>> index - index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
}
indexData[i] = record;
}
}
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> onehot - onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
*/
XTensor OnehotToIndex(XTensor & onehot, int size)
{
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
index.SetTMPFlag();
_OnehotToIndex(&onehot, &index, size);
return index;
}
/*
convert index tensor to onehot tensor
>> index - index tensor, whose values are integer indices
>> onehot - onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
onehot->SetZeroAll();
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size);
return;
}
#endif
int blockNum = index->unitNum;
int stride = size;
int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}
}
/*
convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> index - index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
<< return - the onehot tensor
*/
XTensor IndexToOnehot(XTensor & index, int size)
{
CheckNTErrors(index.dataType == X_INT, "The index tensor must be in X_INT!")
XTensor onehot;
onehot.SetTMPFlag();
int order = index.order;
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size);
delete[] dim;
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
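A round-trip sketch for the two conversions (sizes and index values are illustrative): IndexToOnehot expands each index into a row of length size, and OnehotToIndex recovers the indices.

XTensor index;
InitTensor1D(&index, 3, X_INT, -1, NULL);

int data[3] = {2, 0, 3};
index.SetData(data, 3);

/* onehot has shape (3, 4): row i is all zeros except a 1 at column data[i] */
XTensor onehot;
onehot = IndexToOnehot(index, 4);

/* back has shape (3) and holds {2, 0, 3} again */
XTensor back;
back = OnehotToIndex(onehot, 4);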
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include "OnehotAndIndex.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
convert onehot tensor to index tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of blocks
>> stride - stride of a data block
*/
__global__
void KernelOnehotToIndex(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int * id = indexData + i;
if (od[offset] != 0)
*id = offset;
}
/*
convert onehot tensor to index tensor (cuda version)
>> onehot - onehot tensor, whose values are 0 or 1
>> index - index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelOnehotToIndex<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
/*
convert index tensor to onehot tensor (kernel version)
>> onehotData - the data pointer of the onehot tensor
>> indexData - the data pointer of the index tensor
>> blockNum - the number of blocks
>> stride - stride of a data block
*/
__global__
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockNum || offset >= stride)
return;
int * od = onehotData + i * stride;
int id = indexData[i];
od[id] = 1;
}
/*
convert index tensor to onehot tensor (cuda version)
>> index - index tensor, whose values are integer indices
>> onehot - onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
{
int devID = onehot->devID;
int blockNum = index->unitNum;
int stride = size;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_CUH__
#define __ONEHOTANDINDEX_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-17
*/
#ifndef __ONEHOTANDINDEX_H__
#define __ONEHOTANDINDEX_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num);
} // namespace nts(NiuTrans.Tensor)
#endif // __ONEHOTANDINDEX_H__
\ No newline at end of file
......@@ -70,8 +70,9 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
}
DTYPE finfout = gain * (float)sqrt(6.0F/(fanIn + fanOut));
tensor->SetDataRand(-finfout, finfout);
DTYPE std = gain * (float)sqrt(2.0 / (fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0F) * std;
tensor->SetDataRand(-a, a);
//_SetDataRand(tensor, -finfout, finfout);
}
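For reference, the two-step form samples from the same range as the removed one-liner: a = sqrt(3) * std = sqrt(3) * gain * sqrt(2 / (fanIn + fanOut)) = gain * sqrt(6 / (fanIn + fanOut)), i.e. the usual Xavier/Glorot uniform bound, just expressed via the distribution's standard deviation.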
......@@ -499,36 +500,9 @@ void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values
}
else {
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
_CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, offsetsCuda);
XMemFree(tensor->devID, valuesCuda);
if(tensor->devID >= 0) {
_CudaSetDataWithOffsetAndValue(tensor, offsets, values, num);
return;
}
#else
ShowNTErrors("Please recompile the code with USE_CUDA");
......
......@@ -26,6 +26,7 @@
#include "SetData.cuh"
#include <curand_kernel.h>
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -363,7 +364,7 @@ e.g., for a 3* 3 tensor,
2 2 0
*/
__global__
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -425,7 +426,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup);
}
......@@ -474,12 +475,12 @@ set the data with an array of offsets (kernel version)
>> num - number of the data items
*/
__global__
void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
void KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
{
/* index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < num)
if (i < num)
data[offsets[i]] = value;
}
......@@ -505,7 +506,7 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYP
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
KernelSetDataWithOffset << <blocks, threads >> > ((DTYPE*)tensor->data, offsets, value, num);
BacktoCudaDev(tensor->devID, devIDBackup);
}
......@@ -519,7 +520,7 @@ set the data with an array of offsets (kernel version)
>> dataType - the data type of the data and values
*/
__global__
void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
void KernelSetDataWithOffsetAndValue(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
{
/* index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -541,6 +542,18 @@ set the data with an array of values
*/
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{
XMem * mem = tensor->mem;
MTYPE offsetSize = num * sizeof(MTYPE);
MTYPE valueSize;
if (tensor->dataType == X_INT)
valueSize = num * sizeof(int);
else if (tensor->dataType == X_FLOAT)
valueSize = num * sizeof(float);
else
ShowNTErrors("TO DO!!!");
int gridSize[3];
int blockSize[3];
......@@ -552,7 +565,32 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType);
MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, mem->devID, values, -1, valueSize);
}
else {
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
}
KernelSetDataWithOffsetAndValue<<<blocks, threads >>> (tensor->data, offsetsCuda, valuesCuda, num, tensor->dataType);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
}
else {
XMemFree(tensor->devID, valuesCuda);
XMemFree(tensor->devID, offsetsCuda);
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
......
......@@ -15,7 +15,6 @@
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
DTYPE myIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
DTYPE myIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// There is no need to make a link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// There is no need to make a link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
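A small usage sketch for the new comparison functions (values are illustrative); each call compares every entry against a scalar and writes 1.0F or 0.0F:

XTensor a;
InitTensor1D(&a, 4, X_FLOAT, -1, NULL);

float data[4] = {1.0F, 2.0F, 2.0F, 3.0F};
a.SetData(data, 4);

/* eq = {0, 1, 1, 0}, neq = {1, 0, 0, 1} */
XTensor eq;
XTensor neq;
eq = Equal(a, 2.0F);
neq = NotEqual(a, 2.0F);

/* the in-place variant overwrites its input */
_EqualMe(&a, 2.0F);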
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
DTYPE cudaIsEqual(DTYPE a, DTYPE b)
{
return (a == b ? 1.0F : 0.0F);
}
__device__
DTYPE cudaIsNotEqual(DTYPE a, DTYPE b)
{
return (a != b ? 1.0F : 0.0F);
}
#define SIMPLE_COMPARE_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size, DTYPE number) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (DTYPE)origFunc(a[i], number); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size, __half number) \
{ \
return; \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, \
a->unitNum, (DTYPE)number); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, \
a->unitNum, (__half)number); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
SIMPLE_COMPARE_FUNCTION_GPU(Equal, cudaIsEqual)
SIMPLE_COMPARE_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_CUH__
#define __COMPARE_CUH__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, int size, DTYPE number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, int size, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif //end __COMPARE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __COMPARE_H__
#define __COMPARE_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compare whether every entry is equal to the specified value */
void _Equal(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _EqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Equal(const XTensor & a, DTYPE number);
/* compare whether every entry is not equal to the specified value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (do it on site)
keep the result in the input tensor a and return nothing */
void _NotEqualMe(XTensor * a, DTYPE number);
/* compare whether every entry is not equal to the specified value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor NotEqual(const XTensor & a, DTYPE number);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
......@@ -223,4 +223,4 @@ _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#endif
}
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -15,7 +15,6 @@
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
......
......@@ -15,7 +15,6 @@
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
......
......@@ -15,7 +15,6 @@
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
......@@ -145,5 +144,6 @@ void _TanMe(XTensor * a);
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
}
#endif //end __UNARY_H__
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
#endif // end __UNARY_H__
\ No newline at end of file
......@@ -79,8 +79,13 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "CopyIndexed.h"
#include "CopyIndexed.cuh"
#include "CopyBlocks.h"
#include "Gather.h"
#include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -40,7 +42,9 @@ copy indexed sub-tensors
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -99,7 +103,148 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
}
/*
copy indexed sub-tensors (return an XTensor structure)
copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(indexSize != 0, "NULL index!")
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((srcIndex && tgtIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == tgtIndex->devID || (srcIndex->devID < 0 && tgtIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && t->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < order), "A too larget dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == tgtIndex->unitNum), "Unmatched index tensors!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == t->GetDim(i), "Unmatched dimensions");
}
else {
CheckNTErrors(t->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
}
}
#ifdef USE_CUDA
if (s->devID >= 0 && srcIndex->devID >= 0) {
_CudaCopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
return;
}
#endif
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int*)srcIndex->data;
int * tIndex = (int*)tgtIndex->data;
for (int i = 0; i < indexSize; i++) {
for (int c = 0; c < copyNum; c++) {
int si = sIndex[i] + c;
int ti = tIndex[i] + c;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = tData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(td + k) = *(sd + k);
}
}
}
}
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum)
{
CheckNTErrors(dim >= 0 && dim < s.order, "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
int indexSize = srcIndex.unitNum;
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3);
list.Add(&s);
list.Add(&srcIndex);
list.Add(&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
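/* The same copy through the XTensor-level wrapper (editor's sketch, reusing
   srcIdx and tgtIdx from the sketch above); this form also records the
   MOVEMENT_COPYINDEXED link so that the operation can be differentiated:

       XTensor c = CopyIndexed(*s, 1, *srcIdx, *tgtIdx, 1);
*/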
/*
copy indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
*/
#include "CopyIndexed.cuh"
#include "../../XDevice.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
copy selected sub-tensors where indices are kept in tensors (kernel version)
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
__global__
void KernelCopyIndexed(DTYPE * sData, DTYPE * tData, int * sIndex, int * tIndex,
int blockNum, int blockSizeSrc, int blockSizeTgt,
int stride, int indexSize, int copyNum)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int offset = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum * indexSize * copyNum || offset >= stride)
return;
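/* decompose the x index into a (block, sub-tensor) pair for this thread;
   each source index expands into copyNum consecutive sub-tensors */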
int realIndexSize = indexSize * copyNum;
int realBlockNum = i / realIndexSize;
int realIndex = i % realIndexSize;
int realSrcIndex = sIndex[realIndex / copyNum] + realIndex % copyNum;
int realTgtIndex = tIndex[realIndex / copyNum] + realIndex % copyNum;
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + realBlockNum * blockSizeSrc + realSrcIndex * stride;
tp[threadIdx.x] = tData + realBlockNum * blockSizeTgt + realTgtIndex * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
t[offset] = s[offset];
}
/*
copy selected sub-tensors where indices are kept in tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> tgtIndex - the tensor to save the index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum)
{
int devID = s->devID;
int order = s->order;
int indexSize = srcIndex->unitNum;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * t->GetDim(dim);
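/* one thread copies one element: the x dimension of the grid enumerates the
   blockNum * indexSize * copyNum sub-tensor copies, and the y dimension
   enumerates the stride elements inside each sub-tensor */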
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, blockNum * indexSize * copyNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data;
int * tIndex = (int *)tgtIndex->data;
KernelCopyIndexed<<<blocks, threads >>>(sData, tData, sIndex, tIndex,
blockNum, blockSizeSrc, blockSizeTgt,
stride, indexSize, copyNum);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-30
* Tomorrow is the celebration of the laboratory, I'm so happy!
*/
#ifndef __CopyIndexed_CUH__
#define __CopyIndexed_CUH__
#include "../../XTensor.h"
#include "CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy selected sub-tensors where indices are kept in tensors (cuda version) */
void _CudaCopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __CopyIndexed_CUH__
\ No newline at end of file
......@@ -27,22 +27,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indices are kept in tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex);
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
/*
copy selected sub-tensors (return an XTensor structure)
copy selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???)
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it (remove this???)
make a new tensor to keep the result and return it
*/
void CopyIndexed(const XTensor * s, XTensor * t, int dim, const XTensor * srcIndex, const XTensor * tgtIndex);
XTensor CopyIndexed(const XTensor & s, int dim,
const XTensor & srcIndex, const XTensor & tgtIndex,
int copyNum = 1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -57,15 +57,14 @@ gather indexed sub-tensors
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex)
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID && t->devID == srcIndex->devID),
"the data must be kept on the same device!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0 && srcIndex->devID >= 0) {
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex);
return;
}
......@@ -116,6 +115,8 @@ XTensor Gather(XTensor &s, XTensor &index)
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMPFlag();
delete[] dimSize;
_Gather(&s, &t, &index);
/* tensor connection */
......@@ -137,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index)
}
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/
#include "Gather.cuh"
#include "CopyBlocksSelected.cuh"
......@@ -41,7 +41,7 @@ __global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
{
__shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * tp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -54,15 +54,15 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
if(threadIdx.y == 0){
sp[threadIdx.x] = sData + sIndex[i] * stride;
cp[threadIdx.x] = tData + i * stride;
tp[threadIdx.x] = tData + i * stride;
}
__syncthreads();
DTYPE * s = sp[threadIdx.x];
DTYPE * c = cp[threadIdx.x];
DTYPE * t = tp[threadIdx.x];
c[offset] = s[offset];
t[offset] = s[offset];
}
/*
......@@ -72,9 +72,10 @@ gather indexed sub-tensors(cuda version)
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
{
int devID = s->devID;
XMem * mem = s->mem;
int stride = s->GetDim(1);
int indexSize = srcIndex->unitNum;
......@@ -93,10 +94,26 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
DTYPE * sData = (DTYPE*)s->data;
DTYPE * tData = (DTYPE*)t->data;
int * sIndex = (int *)srcIndex->data;
int * sIndex = NULL;
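/* if the index tensor lives in host memory, stage a copy of it on the device
   (via the memory pool when one is attached) before launching the kernel */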
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
if (srcIndex->devID < 0) {
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
else
XMemFree(devID, sIndex);
}
BacktoCudaDev(devID, devIDBackup);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-11-27
*/
#ifndef __GATHER_CUH__
#define __GATHER_CUH__
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* gather indexed sub-tensors(cuda version) */
void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex);
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
#endif // USE_CUDA
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex);
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
......
......@@ -134,63 +134,92 @@ void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum,
/*
spread a collection tensor to source tensor.
And this is a special spread function for backward computation of gather function.
This is a special spread function for the backward computation of the CopyIndexed function.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> s - the source tensor whose data would be modified
>> c - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize)
void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum)
{
int order = source->order;
int order = s->order;
int indexSize = srcIndex->unitNum;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(indexSize != 0, "NULL index!");
CheckNTErrors((s && c), "Invalid tensors!");
CheckNTErrors((srcIndex && collIndex), "Invalid index tensors!");
CheckNTErrors((s->devID == c->devID || (s->devID < 0 && c->devID < 0)),
"the data must be kept on the same device!");
CheckNTErrors((srcIndex->devID == collIndex->devID || (srcIndex->devID < 0 && collIndex->devID < 0)),
"the index must be kept on the same device!");
CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && srcIndex->devID < 0)),
"the data and index must be kept on the same device!");
CheckNTErrors((dim >= 0 && dim < s->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == c->unitSize), "Unmatched tensors!");
CheckNTErrors((srcIndex->unitNum == collIndex->unitNum), "Unmatched index tensors!");
CheckNTErrors(s->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for (int i = 0; i < order; i++){
if (i == dim) {
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
for (int i = 0; i < order; i++) {
if (i != dim) {
CheckNTErrors(s->GetDim(i) == c->GetDim(i), "Unmatched dimensions");
}
else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
CheckNTErrors(c->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
}
}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize);
if(s->devID >= 0 && c->devID >= 0) {
_CudaSpreadForCopyIndexed(s, c, dim, srcIndex, collIndex, copyNum);
return;
}
#endif
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
for (int i = dim + 1; i < order; i++) {
stride *= source->GetDim(i);
}
for (int i = 0; i < dim; i++)
blockNum *= s->GetDim(i);
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
for (int i = dim + 1; i < order; i++)
stride *= s->GetDim(i);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
blockSizeSrc = stride * s->GetDim(dim);
blockSizeTgt = stride * c->GetDim(dim);
for(int i = 0; i < indexSize; i++){
int src = srcIndex[i];
int tgt = i;
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
DTYPE * sData = (DTYPE*)s->data;
DTYPE * cData = (DTYPE*)c->data;
int * sIndex = (int*)srcIndex->data;
int * cIndex = (int*)collIndex->data;
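/* accumulate (+=) rather than assign: several collection sub-tensors may
   spread back into the same source sub-tensor, and their gradients must sum */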
for (int i = 0; i < indexSize; i++) {
for (int n = 0; n < copyNum; n++) {
int si = sIndex[i] + n;
int ti = cIndex[i] + n;
for (int j = 0; j < blockNum; j++) {
DTYPE * sd = sData + j * blockSizeSrc + si * stride;
DTYPE * td = cData + j * blockSizeTgt + ti * stride;
for (int k = 0; k < stride; k++)
*(sd + k) += *(td + k);
}
}
}
}
/*
......@@ -218,7 +247,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0 && index->devID >= 0) {
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, index);
return;
}
......@@ -241,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -32,9 +32,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize);
/* special spread function for backward computation of CopyIndexed function (cuda version) */
void _CudaSpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
......
......@@ -36,9 +36,10 @@ void Spread(XTensor * source, XTensor * collection,
XTensor * srcIndex, XTensor * collIndex,
int dim);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize);
/* special spread function for backward computation of CopyIndexed function */
void _SpreadForCopyIndexed(XTensor * source, XTensor * collection, int dim,
XTensor * srcIndex, XTensor * collIndex,
int copyNum);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
......
......@@ -46,6 +46,22 @@ sum all the items of the tensor (It should be optimized!)
*/
DTYPE _ReduceSumAll(const XTensor * source)
{
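/* flatten the input into a 1 * unitNum tensor, reduce it along dimension 1,
   and read the single resulting value back */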
int dims[2] = {1, source->unitNum};
int one = 1;
XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, result, 1);
DTYPE r = result->Get1D(0);
DelTensorBuf(result);
DelTensorBuf(all);
return r;
int order = source->order;
DTYPE summation;
......@@ -60,7 +76,7 @@ DTYPE _ReduceSumAll(const XTensor * source)
dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, leadingDim);
delete big;
......
......@@ -94,7 +94,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
gridSize = blockNum;
gridNum = s->unitNum / (blockSize * blockNum);
if (mergedNum * gridNum <= MIN_TENSOR_SPLIT_NUM) {
if (mergedNum * gridNum <= MIN_TENSOR_MERGE_NUM) {
int sPitch = blockSize * s->unitSize;
int tPtich = blockSize * mergedNum * t->unitSize;
int mSize = blockSize * t->unitSize;
......@@ -253,7 +253,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
gridNum = s0->unitNum / (blockSize * blockNum);
/* merging with fewer data copy operations */
if (mergedNum * gridNum <= MIN_TENSOR_SPLIT_LIST_NUM) {
if (mergedNum * gridNum <= MIN_TENSOR_MERGE_LIST_NUM) {
int sPitch = blockSize * s0->unitSize;
int tPtich = blockSize * mergedNum * big->unitSize;
int mSize = blockSize * big->unitSize;
......
......@@ -126,7 +126,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
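/* allocate on s->devID here: mem can be NULL when the tensor is not attached to a memory pool */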
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
......
......@@ -153,7 +153,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
......@@ -166,4 +166,33 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
return MultiplyDim(x, mask, n, 0);
}
/*
dropout function without broadcast
>> x - input tensor
>> dropProb - probability to set an element to zero
*/
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
{
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
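/* inverted dropout: surviving entries are scaled by 1 / (1 - p) so that the
   expected value of each entry is unchanged */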
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
/* generate a mask tensor with probability p */
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
return Multiply(x, mask);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -42,6 +42,9 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -373,9 +373,9 @@ better numerical stability.
>> leadDim - leading dimension (along which we perform reduction)
*/
void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
XTensor * dedy, XTensor * dedx,
XTensor * padding, int leadDim,
LOSS_FUNCTION_NAME lossName)
{
leadDim = leadDim < 0 ? y->order - 1 : leadDim;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#ifndef __TEST_ABSOLUTE_H__
#define __TEST_ABSOLUTE_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#include "../XTensor.h"
#include "../core/math/Clip.h"
#include "TClip.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#ifndef __TEST_CLIP_H__
#define __TEST_CLIP_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "../XTensor.h"
#include "../core/math/Compare.h"
#include "TCompare.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Equal function.
Compare whether every entry is equal to the specified value.
*/
bool TestCompare1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, 1.0F} };
DTYPE answer[3][2] = { {1.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(a, b, 1.0);
_EqualMe(aMe, 1.0);
bUser = Equal(*a, 1.0);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) &&
aMe->CheckData(answer, aUnitNum, 1e-4F) &&
bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Equal function */
_Equal(aGPU, bGPU, 1.0);
_EqualMe(aMeGPU, 1.0);
bUserGPU = Equal(*aGPU, 1.0);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) &&
aMeGPU->CheckData(answer, aUnitNum, 1e-4F) &&
bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Compare Function */
bool TestCompare()
{
XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCompare1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-10
*/
#ifndef __TEST_Compare_H__
#define __TEST_Compare_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Compare Function */
bool TestCompare();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_Compare_H__
......@@ -232,13 +232,12 @@ bool TestConvertDataType3()
/* initialize variables */
a->SetData(data1, unitNum1);
/* call ConvertDataType function */
/* call ConvertDataType function (We have not implemented this yet...) */
//_ConvertDataType(a, b);
//_ConvertDataType(b, c);
/* check results */
cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
c->Dump(stderr, "");
//cpuTest = a->CheckData(data1, unitNum1, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......
......@@ -311,8 +311,8 @@ bool TestCrossEntropy3()
delete goldGPU;
delete lossGPU;
delete weightGPU;
delete[] dimSize;
delete[] wDimSize;
return cpuTest && gpuTest;
#else
......@@ -322,6 +322,7 @@ bool TestCrossEntropy3()
delete loss;
delete weight;
delete[] dimSize;
delete[] wDimSize;
return cpuTest;
#endif // USE_CUDA
......
......@@ -212,6 +212,8 @@ bool TestDropout2()
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
......
......@@ -332,6 +332,7 @@ bool TestGather3()
/* destroy variables */
delete s;
delete t;
delete index;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......
......@@ -150,6 +150,7 @@ bool TestSetData2()
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest;
#else
......@@ -157,6 +158,7 @@ bool TestSetData2()
delete s;
delete modify;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest;
#endif // USE_CUDA
......@@ -242,6 +244,7 @@ bool TestSetData3()
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest && gpuTest;
#else
......@@ -249,6 +252,7 @@ bool TestSetData3()
delete s;
delete modify;
delete[] sDimSize;
delete[] dataDimSize;
return cpuTest;
#endif // USE_CUDA
......
......@@ -61,7 +61,9 @@ bool TestSort1()
_SortMe(aMe, index, 0);
Sort(*a, bUser, *index, 0);
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum);
cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -85,7 +87,9 @@ bool TestSort1()
Sort(*aGPU, bUserGPU, *indexGPU, 0);
/* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum);
gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete a;
......@@ -149,7 +153,9 @@ bool TestSort2()
Sort(*a, bUser, *index, 1);
/* check results */
cpuTest = b->CheckData(answer, unitNum) && aMe->CheckData(answer, unitNum) && bUser.CheckData(answer, unitNum);
cpuTest = b->CheckData(answer, unitNum) &&
aMe->CheckData(answer, unitNum) &&
bUser.CheckData(answer, unitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -173,7 +179,9 @@ bool TestSort2()
Sort(*aGPU, bUserGPU, *indexGPU, 1);
/* check results */
gpuTest = bGPU->CheckData(answer, unitNum) && aMeGPU->CheckData(answer, unitNum) && bUserGPU.CheckData(answer, unitNum);
gpuTest = bGPU->CheckData(answer, unitNum) &&
aMeGPU->CheckData(answer, unitNum) &&
bUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete a;
......
......@@ -357,6 +357,7 @@ bool TestSplit3()
delete[] sDimSize;
delete[] tDimSize1;
delete[] tDimSize2;
delete tList;
return cpuTest;
#endif // USE_CUDA
......
......@@ -182,6 +182,7 @@ bool TestSpread2()
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
/* CPU test */
bool cpuTest = true;
......@@ -190,17 +191,19 @@ bool TestSpread2()
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
index->SetData(srcIndex, indexSize);
sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForCopyIndexed and _SpreadForGather functions */
_SpreadForGather(s1, t, dim, srcIndex, indexSize);
_SpreadForGather(s2, t, index);
_SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForGather(s2, t, sIndex);
/* check results */
cpuTest = s1->CheckData(answer, tUnitNum) &&
......@@ -214,17 +217,19 @@ bool TestSpread2()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
indexGPU->SetData(srcIndex, indexSize);
sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForCopyIndexed and _SpreadForGather functions */
_SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize);
_SpreadForGather(sGPU2, tGPU, indexGPU);
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, cIndexGPU, 1);
_SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
......@@ -234,11 +239,13 @@ bool TestSpread2()
delete s1;
delete s2;
delete t;
delete index;
delete sIndex;
delete cIndex;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete indexGPU;
delete sIndexGPU;
delete cIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -249,6 +256,8 @@ bool TestSpread2()
delete s1;
delete s2;
delete t;
delete sIndex;
delete cIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......
......@@ -31,6 +31,7 @@ bool Test()
wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong;
wrong = !TestCompare() || wrong;
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
......
......@@ -24,6 +24,7 @@
#include "TAbsolute.h"
#include "TClip.h"
#include "TCompare.h"
#include "TConcatenate.h"
#include "TConcatenateSolely.h"
#include "TCos.h"
......