1. Bug fixed; 2. Support abs, sign, convert data type(float <->int) ;3. Merge…

1. Bug fixed; 2. Support abs, sign, convert data type(float <->int) ;3. Merge with xu and xiao branch

1. Bug fixed; 2. Support abs, sign, convert data type(float <->int) ;3. Merge…
1. Bug fixed; 2. Support abs, sign, convert data type(float <->int) ;3. Merge with xu and xiao branch
1410c491 · liyinqiao · 2d504e7a · 2d504e7a · 1410c491 · 1410c491
Commit 1410c491 authored Jul 11, 2018 by liyinqiao
--- a/source/XLink.cpp
+++ b/source/XLink.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-04
- */
-#include <stdio.h>
-#include "XLink.h"
-#include "XName.h"
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-int XLink::paramSize = 64;
-/* constructor: build an empty edge (no head, no tails, no parameters, no type) */
-XLink::XLink()
-{
-    typeID = 0;
-    type[0] = 0;
-    paramNum = 0;
-    tailNum  = 0;
-    params = NULL;
-    tails  = NULL;
-    head   = NULL;
-}
-/* destructor: free the raw parameter buffer and the array of tail pointers */
-XLink::~XLink()
-{
-    delete[] (char*)params;
-    delete[] tails;
-}
-/* 
-reset it: release the tail array and the parameter buffer, and put
-every field back to the value assigned by the constructor
-*/
-void XLink::Reset()
-{
-    delete[] tails;
-    delete[] (char*)params;
-    head   = NULL;
-    tails  = NULL;
-    params = NULL;
-    tailNum  = 0;
-    paramNum = 0;
-    type[0] = 0;
-    /* clear the type id together with the type name so that the two never disagree */
-    typeID = 0;
-}
-/* 
-set edge type name 
-The name is looked up with GetOPName(id) and copied into "type";
-the id itself is kept in "typeID".
->> id - id of the type
-*/
-void XLink::SetType(int id)
-{
-    type[0] = 0;
-    strcpy(type, GetOPName(id));
-    typeID = id;
-    /* CheckNTErrors reports an error when its condition is FALSE, so the
-       condition has to hold for a legal name, i.e., a name other than "NULL" */
-    CheckNTErrors(strcmp(type, "NULL") != 0, "illegal edge type name!");
-}
-/* 
-set head 
-Only the pointer is stored; nothing else in the edge is changed.
->> h - pointer to the head tensor
-*/
-void XLink::SetHead(XTensor * h)
-{
-    head = h;
-}
-/* 
-add a tail
-The tail array is re-allocated with one more slot, the old entries are
-copied over and the old array is released.
->> t - pointer to the tail tensor
-*/
-void XLink::AddTail(XTensor * t)
-{
-    XTensor ** ts = tails;
-    tails = new XTensor*[tailNum + 1];
-    /* "ts" is NULL when the first tail is added; memcpy must not be given
-       a NULL source, even for a zero-byte copy */
-    if(tailNum > 0)
-        memcpy(tails, ts, sizeof(XTensor*) * tailNum);
-    tails[tailNum++] = t;
-    delete[] ts;
-}
-/* 
-add two tails in one time 
-The tail array is re-allocated with two more slots, the old entries are
-copied over and the old array is released.
->> t1 - pointer to the tail tensor
->> t2 - pointer to another tail tensor
-*/
-void XLink::AddTwoTails(XTensor * t1, XTensor * t2)
-{
-    XTensor ** ts = tails;
-    tails = new XTensor*[tailNum + 2];
-    /* "ts" is NULL when the edge has no tail yet; memcpy must not be given
-       a NULL source, even for a zero-byte copy */
-    if(tailNum > 0)
-        memcpy(tails, ts, sizeof(XTensor*) * tailNum);
-    tails[tailNum++] = t1;
-    tails[tailNum++] = t2;
-    delete[] ts;
-}
-/* 
-add a parameter 
-Parameters are kept in one raw buffer made of slots of paramSize bytes each.
-The buffer is re-allocated with one more slot and the value is written at
-the beginning of the new slot.
->> param - parameter in default type
-*/
-void XLink::AddParam(DTYPE param)
-{
-    void * ps = params;
-    /* the buffer holds (paramNum + 1) SLOTS, not (paramNum + 1) bytes */
-    params = new char[(paramNum + 1) * paramSize];
-    /* "ps" is NULL for the first parameter; skip the copy in that case */
-    if(paramNum > 0)
-        memcpy(params, ps, paramNum * paramSize);
-    DTYPE * p = (DTYPE*)((char*)params + paramNum * paramSize);
-    *p = param;
-    paramNum++;
-    delete[] (char*)ps;
-}
-/* 
-add a parameter 
-Parameters are kept in one raw buffer made of slots of paramSize bytes each.
-The buffer is re-allocated with one more slot and "size" bytes are copied
-to the beginning of the new slot; the rest of the slot is zeroed.
->> param - pointer to the parameter
->> size - size of the parameter (in bytes, at most paramSize)
-*/
-void XLink::AddParam(void * param, int size)
-{
-    /* a parameter larger than a slot would overwrite memory behind the buffer */
-    CheckNTErrors(size >= 0 && size <= paramSize, "the parameter does not fit into a slot!");
-    void * ps = params;
-    /* the buffer holds (paramNum + 1) SLOTS, not (paramNum + 1) bytes */
-    params = new char[(paramNum + 1) * paramSize];
-    /* "ps" is NULL for the first parameter; skip the copy in that case */
-    if(paramNum > 0)
-        memcpy(params, ps, paramNum * paramSize);
-    char * p = (char*)params + paramNum * paramSize;
-    memset(p, 0, paramSize);
-    memcpy(p, param, size);
-    paramNum++;
-    delete[] (char*)ps;
-}
-/* 
-create a hyperedge with two input tensors and an output tensor 
-The incoming edge of h is rebuilt from scratch (head = h, tails = t1 [, t2],
-type = id) and h is appended to the outgoing edge of every non-NULL tail.
-Nothing is done when h is NULL.
->> t1 - a tail tensor
->> t2 - another tail tensor (can be NULL)
->> h - head tensor
->> id - id of the edge type
-*/
-void XLink::MakeLink(XTensor * t1, XTensor * t2, XTensor * h, int id)
-{
-    /* no head, no edge: h is dereferenced below, so it must not be NULL */
-    if(h == NULL)
-        return;
-    /* forward */
-    XLink &income = h->income;
-    income.Reset();
-    income.SetHead(h);
-    if(t1 != NULL && t2 != NULL)
-        income.AddTwoTails(t1, t2);
-    else if(t1 != NULL)
-        income.AddTail(t1);
-    else{
-        ShowNTErrors("TODO!");
-    }
-    income.SetType(id);
-    /* backward for t1 */
-    if(t1 != NULL){
-        XLink &outgo = t1->outgo;
-        CheckNTErrors(outgo.head != t1, "Wrong head of the hyperedge!");
-        outgo.AddTail(h);
-    }
-    /* backward for t2 */
-    if(t2 != NULL){
-        XLink &outgo = t2->outgo;
-        CheckNTErrors(outgo.head != t2, "Wrong head of the hyperedge!");
-        outgo.AddTail(h);
-    }
-}
-/* 
-create a hyper edge with a list of tensors and an output tensor 
-The incoming edge of h is rebuilt with all tensors of the list as tails;
-then h is appended to the outgoing edge of each of them.
->> list - a list of input tensors
->> h - head tensor
->> id - id of the edge type
-*/
-void XLink::MakeLink(XList * list, XTensor * h, int id)
-{
-    /* forward: head, type and tails of the incoming edge of h */
-    XLink &income = h->income;
-    income.Reset();
-    income.SetHead(h);
-    income.SetType(id);
-    int k = 0;
-    while(k < list->count){
-        income.AddTail((XTensor*)list->GetItem(k));
-        k++;
-    }
-    /* backward: every input tensor gets h as a new tail of its outgoing edge */
-    k = 0;
-    while(k < list->count){
-        XTensor * t = (XTensor*)list->GetItem(k);
-        XLink &outgo = t->outgo;
-        CheckNTErrors(outgo.head != t, "Wrong head of the hyperedge!");
-        outgo.AddTail(h);
-        k++;
-    }
-}
-/* 
-add parameters 
-The parameter is appended to the incoming edge of h.
-Nothing is done when h is NULL.
->> h - head
->> param - parameter we want introduce
-*/
-void XLink::AddParamToHead(XTensor * h, DTYPE param)
-{
-    /* h is dereferenced below, so it must not be NULL */
-    if(h == NULL)
-        return;
-    h->income.AddParam(param);
-}
-/* 
-add an integer parameter 
-The integer is appended (as sizeof(int) raw bytes) to the incoming edge of h.
-Nothing is done when h is NULL.
->> h - head
->> param - parameter we want introduce
-*/
-void XLink::AddParamToHeadInt(XTensor * h, int param)
-{
-    /* h is dereferenced below, so it must not be NULL */
-    if(h == NULL)
-        return;
-    h->income.AddParam(&param, sizeof(int));
-}
-} // namespace nts(NiuTrans.Tensor)
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
+ */
+#include <stdio.h>
+#include "../tensor/XTensor.h"
+//#define CRTDBG_MAP_ALLOC
+//#include <stdlib.h>
+//#include <crtdbg.h>
+using namespace nts;
+/* entry of NiuTrans.Network: prints a short usage note unless "-test" is given */
+int main( int argc, const char ** argv )
+{
+    const bool testRequested = (argc > 1) && (strcmp(argv[1], "-test") == 0);
+    if(!testRequested){
+        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
+        fprintf(stderr, "neural networks in an easy way. \n\n");
+        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
+    }
+    /* with "-test" nothing is run yet: the call of Test() is still disabled */
+    //Test();
+    //_CrtDumpMemoryLeaks();
+    return 0;
+}
--- a/source/Main.cpp
+++ b/source/Main.cpp
@@ -29,8 +29,7 @@
 #include "XTensor.h"
 #include "XDevice.h"
 #include "./sample/fnnlm/FNNLM.h"
+#include "./test/Test.h"
-#include "test/Test.h"
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>  
@@ -39,8 +38,16 @@
 using namespace nts;
 using namespace samplefnnlm;
+void SmallTest();
 int main( int argc, const char ** argv )
 {
+    //_CrtSetBreakAlloc(78);
+    /* a tiny test */
+    //if(1)
+    //    SmallTest();
    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
@@ -56,3 +63,30 @@ int main( int argc, const char ** argv )
    return 0;
 }
+/* a tiny test: builds a small network on 2 * 2 tensors with Sum, Multiply,
+   the overloaded operators and Lin, then checks/shows the network and
+   dumps all tensors to stderr */
+void SmallTest()
+{
+    XTensor a;
+    XTensor b;
+    InitTensor2D(&a, 2, 2);
+    a.SetZeroAll();
+    a.Set2D(1.0F, 0, 0);
+    a.Set2D(2.0F, 1, 1);
+    b = Sum(a, Multiply(a, a));
+    XTensor c = a * b + a;
+    XTensor d = a + b + c.Lin(0.5F);
+    XLink::CheckNetwork(&d);
+    XLink::ShowNetwork(stderr, &b);
+    a.Dump(stderr, "a: ");
+    b.Dump(stderr, "b: ");
+    c.Dump(stderr, "c: ");
+    d.Dump(stderr, "d: ");
+}
--- a/source/XBLAS.cpp
+++ b/source/XBLAS.cpp
--- a/source/XBLAS.h
+++ b/source/XBLAS.h
--- a/source/XDataType.cpp
+++ b/source/XDataType.cpp
--- a/source/XDataType.h
+++ b/source/XDataType.h
--- a/source/XDevice.cpp
+++ b/source/XDevice.cpp
--- a/source/XDevice.h
+++ b/source/XDevice.h
--- a/source/XGlobal.cpp
+++ b/source/XGlobal.cpp
--- a/source/XGlobal.h
+++ b/source/XGlobal.h
@@ -74,7 +74,7 @@ namespace nts {
 { \
    if(!(x)) \
    { \
-        fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
+        fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
        exit(1); \
    } \
 } \
@@ -83,7 +83,7 @@ namespace nts {
 { \
    if(!(x)) \
    { \
-        fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
+        fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
        exit(1); \
    } \
 } \
@@ -91,7 +91,7 @@ namespace nts {
 #define ShowNTErrors(msg) \
 { \
    { \
-        fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
+        fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
        exit(1); \
    } \
 } \

--- a/source/XHeap.cpp
+++ b/source/XHeap.cpp
--- a/source/XHeap.h
+++ b/source/XHeap.h
--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
--- a/source/XLink.h
+++ b/source/XLink.h
@@ -86,6 +86,16 @@ struct XLink
    /* reset it */
    void Reset();
+    /* clear it */
+    void Clear();
+    /* clear tails */
+    void ClearTail();
+    /* clear the incoming node list of tensor node */
+    static
+    void ClearIncoming(XTensor * node);
    /* set edge type id and name */
    void SetType(int id);
@@ -106,7 +116,7 @@ struct XLink
    /* create a hyper edge with two input tensors and a output tensor */
    static
-    void MakeLink(XTensor * t1, XTensor * t2, XTensor * h, int id);
+    void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
    /* create a hyper edge with a list of input tensors and a output tensor */
    static
@@ -119,6 +129,22 @@ struct XLink
    /* add an integer parameter */
    static
    void AddParamToHeadInt(XTensor * h, int param);
+    /* replace a node with another, i.e., we redirect the links to the new node */
+    static 
+    void Replace(const XTensor * oldOne, XTensor * newOne);
+    /* copy links of a given node */
+    static
+    void CopyIncoming(const XTensor * reference, XTensor * target);
+    /* check the correctness of the network encoded in a root node (tensor) */
+    static
+    void CheckNetwork(XTensor * root);
+    /* show the network encoded in a root node (tensor) */
+    static
+    void ShowNetwork(FILE * file, XTensor * root);
 };
 } // namespace nts(NiuTrans.Tensor)

--- a/source/XList.cpp
+++ b/source/XList.cpp
@@ -111,7 +111,7 @@ void XList::Create(int myMaxNum, XMem * myMem)
 add an item into the list
 >> item - pointer to the item
 */
-void XList::Add(void * item)
+void XList::Add(const void * item)
 {
    if( count == maxNum ){
        void ** newItems;
@@ -126,7 +126,8 @@ void XList::Add(void * item)
        maxNum = maxNum * 2 + 1;
    }
-    items[count++] = item;
+    MTYPE p = (MTYPE)item;
+    items[count++] = (MTYPE*)p;
 }
@@ -355,4 +356,4 @@ void XList::Shuffle(int nround, int beg, int len)
 }
 } 
 /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
--- a/source/XList.h
+++ b/source/XList.h
@@ -69,7 +69,7 @@ public:
    /* utilities */
    void Create(int myMaxNum, XMem * myMem);
-    void Add(void * item);
+    void Add(const void * item);
    void Add(void ** inputItems, int inputItemCount);
    void AddList(XList * l);
    void AddInt(int i);
@@ -99,4 +99,4 @@ public:
 } 
 /* end of the nts (NiuTrans.Tensor) namespace */
 #endif
\ No newline at end of file
--- a/source/XMem.cpp
+++ b/source/XMem.cpp
--- a/source/XMem.h
+++ b/source/XMem.h
--- a/source/XName.cpp
+++ b/source/XName.cpp
@@ -19,15 +19,10 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
 */
-#ifndef __XNAME_H__
+#include "XName.h"
-#define __XNAME_H__
 namespace nts { // namespace nts(NiuTrans.Tensor)
-#define MATH_ARITHMETIC     0x00001000
-#define MATH_SUM            MATH_ARITHMETIC + 1
-#define MATH_MULTIPLY       MATH_SUM + 1
 /* get operator name */
 const char * GetOPName(int type)
 {
@@ -36,6 +31,8 @@ const char * GetOPName(int type)
            return "M_SUM";
        else if(type == MATH_MULTIPLY)
            return "M_MULTIPLY";
+        else if(type == MATH_SCALEANDSHIFT)
+            return "M_SCALEANDSHIFT";
    }
    return "NULL";
@@ -43,4 +40,3 @@ const char * GetOPName(int type)
 } // namespace nts(NiuTrans.Tensor)
-#endif // __XNAME_H__
--- a/source/XName.h
+++ b/source/XName.h
@@ -31,6 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_ARITHMETIC     10000
 #define MATH_SUM            MATH_ARITHMETIC + 1
 #define MATH_MULTIPLY       MATH_SUM + 1
+#define MATH_SCALEANDSHIFT  MATH_MULTIPLY + 1
 /* get operator name */
 const char * GetOPName(int type);

--- a/source/XPRunner.cpp
+++ b/source/XPRunner.cpp
--- a/source/XPRunner.h
+++ b/source/XPRunner.h
--- a/source/XQueue.cpp
+++ b/source/XQueue.cpp
--- a/source/XQueue.h
+++ b/source/XQueue.h
--- a/source/XStream.cpp
+++ b/source/XStream.cpp
--- a/source/XStream.h
+++ b/source/XStream.h
--- a/source/XTensor.cpp
+++ b/source/XTensor.cpp
@@ -39,6 +39,10 @@
 #include "XHeap.h"
 #include "XBLAS.h"
 #include "core/shape/MergeBlockLists.h"
+#include "core/movement/CopyValues.h"
+#include "core/arithmetic/Sum.h"
+#include "core/arithmetic/Multiply.h"
+#include "core/math/ScaleAndShift.h"
 #ifdef USE_CUDA
@@ -55,6 +59,23 @@
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts{
+int tensorIDGlobal = 0;
+MUTEX_HANDLE tensorMutex;
+XTensor firstTensor;
+/* generate a tensor id
+   The current value of tensorIDGlobal is returned and the counter is
+   incremented while tensorMutex is held.
+   NOTE(review): the mutex is initialized lazily by the call that sees
+   tensorIDGlobal == 0, and this test is done BEFORE the lock is taken.
+   The global "firstTensor" above is default-constructed and the default
+   constructor calls MakeTensorID(), so presumably the initialization is
+   meant to happen during static initialization - confirm that no other
+   thread can reach this function first.
+   << return - the id */
+int MakeTensorID()
+{
+    if(tensorIDGlobal == 0)
+        MUTEX_INIT(tensorMutex);
+    MUTEX_LOCK(tensorMutex);
+    int id = tensorIDGlobal++;    
+    MUTEX_UNLOCK(tensorMutex);
+    return id;
+}
 /* 
 constructor 
 >> myOrder - order of the tensor
@@ -63,7 +84,9 @@ constructor
 XTensor::XTensor()
 {
    memset(this, 0, sizeof(XTensor));
+    SetDataPointer();
+    id = MakeTensorID();
    order = -1;
    memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
@@ -81,18 +104,22 @@ XTensor::XTensor()
    isDefaultDType = true;
    isInGlobalMem  = false;
    isInit = false;
+    isTmp =  false;
 }
 /* constructor */
-XTensor::XTensor(XTensor * reference)
+XTensor::XTensor(const XTensor * reference)
 {
    memset(this, 0, sizeof(XTensor));
+    SetDataPointer();
+    id = MakeTensorID();
    dataType = DEFAULT_DTYPE;
    devID = -1;
    denseRatio = 1.0F;
    isDefaultDType = true;
    isInit = false;
+    isTmp  = false;
    InitTensor(this, reference);
 }
@@ -107,6 +134,8 @@ XTensor::XTensor(const int myOrder, int myDevID, XMem * myMem)
 {
    CheckNTErrors((myOrder > 0), "Illegal tensor order1");
+    SetDataPointer();
+    id = MakeTensorID();
    order = myOrder;
    memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
@@ -127,6 +156,7 @@ XTensor::XTensor(const int myOrder, int myDevID, XMem * myMem)
    isDefaultDType = true;
    isInGlobalMem  = false;
    isInit = false;
+    isTmp  = false;
 }
 /* 
@@ -142,6 +172,8 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
 {
    CheckNTErrors((myOrder > 0), "Illegal tensor order1");
+    SetDataPointer();
+    id = MakeTensorID();
    order = myOrder;
    memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
@@ -157,10 +189,50 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
    isDefaultDType = true;
    isInGlobalMem  = false;
    isInit = false;
+    isTmp  = false;
    Resize(myOrder, myDimSize, myDataType, myDenseRatio);
 }
+/* copy constructor
+   The new tensor gets its own id and the shape/type fields of "reference"
+   (see ShallowCopy). What happens to the data depends on reference.isTmp:
+   a temporary tensor hands its data array over to the new tensor, any
+   other tensor is duplicated by InitTensor + CopyValues.
+   The links are treated in the same way: the new tensor replaces a
+   temporary one in the network (XLink::Replace), otherwise only the
+   incoming edge is copied (XLink::CopyIncoming).
+   NOTE(review): unlike the default constructor, this one does not memset
+   the object first, and it leaves isInit == false although data has been
+   set - confirm that both are intended.
+   >> reference - the tensor we copy from */
+XTensor::XTensor(const XTensor &reference)
+{
+    SetDataPointer();
+    id = MakeTensorID();
+    ShallowCopy(reference);
+    data = NULL;
+    dataHost = NULL;
+    if(reference.isTmp){
+        devID = reference.devID;
+        mem = reference.mem;
+        data = reference.data; /* take over the data array of the temporary */
+        /* what we really want to do is "reference.data = NULL;"
+           As "reference" is constant, we cannot reset reference.data
+           here. So we save the ADDRESS of reference.data in
+           reference.dataP, and do this work by updating "*reference.dataP".
+           This is VERY tricky and might not be the best solution :) */
+        *reference.dataP = NULL;
+    }
+    else{
+        devID = reference.devID;
+        mem = reference.mem;
+        InitTensor(this, &reference);
+        CopyValues(&reference, this); /* hard copy of the data array */
+    }
+    if(reference.isTmp)
+        XLink::Replace(&reference, this);
+    else{
+        CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
+        XLink::CopyIncoming(&reference, this);
+    }
+    isInit = false;
+    isTmp  = false;
+}
 /* de-constructor */
 XTensor::~XTensor()
 {
@@ -168,6 +240,7 @@ XTensor::~XTensor()
    data = NULL;
    dataHost = NULL;
    mem = NULL;
+    XLink::ClearIncoming(this);
 }
 /* delete data arrays */
@@ -186,9 +259,32 @@ void XTensor::DestroyData()
    dataHost = NULL;
 }
+/* 
+shallow copy of tensor
+Only the fields that describe the tensor (order, dimensions, data type,
+unit size/number, sparsity and flags) are copied. The data array, the
+device, the memory pool and the links are left as they are.
+>> tensor - the source tensor
+*/
+void XTensor::ShallowCopy(const XTensor &tensor)
+{
+    /* shape */
+    order = tensor.order;
+    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
+    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
+    memcpy(isAllValued, tensor.isAllValued, sizeof(bool) * MAX_TENSOR_DIM_NUM);
+
+    /* data type and size */
+    dataType = tensor.dataType;
+    isDefaultDType = tensor.isDefaultDType;
+    unitSize = tensor.unitSize;
+    unitNum = tensor.unitNum;
+
+    /* sparsity */
+    isSparse = tensor.isSparse;
+    denseRatio = tensor.denseRatio;
+    unitNumNonZero = tensor.unitNumNonZero;
+
+    /* memory flags */
+    isShared = tensor.isShared;
+    isInGlobalMem = tensor.isInGlobalMem;
+}
 /* overloading of the equal-sign */
-XTensor& XTensor::operator = (const XTensor& tensor)
+XTensor& XTensor::operator= (const XTensor& tensor)
 {
+    /* hard copy of data array */
    int size = unitNum * unitSize;
    if( isInit && !isSparse && !tensor.isSparse &&
        size == tensor.unitNum * tensor.unitSize &&
@@ -201,40 +297,51 @@ XTensor& XTensor::operator = (const XTensor& tensor)
    }
    else{
        DestroyData();
-        if(!isInit){
+        if(isInit){
            devID = tensor.devID;
            mem = tensor.mem;
        }
        Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
+        CopyValues(&tensor, this);
-        if(tensor.isSparse) {
-            int num = int(tensor.unitNum * tensor.denseRatio + 1);
-            int tupleSize = sizeof(int)+sizeof(DTYPE);
-            size = sizeof(int) + tupleSize * num;
-        }
-        XMemCopy(data, devID, tensor.data, tensor.devID, size);
    }
-    order = tensor.order;
+    /* copy member variables */
-    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
+    ShallowCopy(tensor);
-    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    dataType = tensor.dataType;
-    unitSize = tensor.unitSize;
-    unitNum = tensor.unitNum;
-    isSparse = tensor.isSparse;
-    unitNumNonZero = tensor.unitNumNonZero;
-    denseRatio =  tensor.denseRatio;
-    isShared = tensor.isShared;
-    isDefaultDType = tensor.isDefaultDType;
-    isInGlobalMem = tensor.isInGlobalMem;
-    memcpy(isAllValued, tensor.isAllValued, sizeof(bool) * MAX_TENSOR_DIM_NUM);
    isInit = true;
+    isTmp  = false;
+    CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
+    /* create tensor links for the new tensor */
+    XLink::Replace(&tensor, this);
    return *this;
 }
+/* overloading of the plus-sign: returns Sum(*this, tensor) as a new tensor
+   >> tensor - the right operand */
+XTensor XTensor::operator+ (const XTensor& tensor)
+{
+    return Sum(*this, tensor);
+}
+/* overloading of the multiply-sign: returns Multiply(*this, tensor) as a new tensor
+   >> tensor - the right operand */
+XTensor XTensor::operator* (const XTensor& tensor)
+{
+    return Multiply(*this, tensor);
+}
+/* 
+linear transformation b = a * \scale + \shift
+where a is this tensor; the work is done by Linear(*this, scale, shift)
+>> scale - the slope
+>> shift - the intercept
+<< return - the result of Linear
+*/
+XTensor XTensor::Lin(DTYPE scale, DTYPE shift)
+{
+    return Linear(*this, scale, shift);
+}
 /* 
 judge whether the two matrices are in the same type and size 
 >> a - input tensor
@@ -555,6 +662,12 @@ bool XTensor::CheckData(const void * d, int num, int beg)
 #endif
    return true;
 }
+/* set the pointer to "data": dataP keeps the ADDRESS of the member "data",
+   so that "data" can be reset through dataP even when the tensor is
+   accessed as a constant (as the copy constructor does) */
+void XTensor::SetDataPointer()
+{
+    dataP = &data;
+}
 bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
 {
@@ -815,7 +928,7 @@ set the value of a cell
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
-    CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
+	CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
    return SetToDevice(devID, GetCell(index, size), value);
 }
@@ -932,6 +1045,15 @@ int XTensor::GetNonzeroSize()
 }
 /* 
+set the tensor as "temporary" (or not); the flag is stored in isTmp
+>> myIsTmp - flag
+*/
+void XTensor::SetTMP(bool myIsTmp)
+{
+    isTmp = myIsTmp;
+}
+/* 
 resize a tensor with a specified tensor size
 >> myOrder - order of the tensor
 >> myDimSize - the size of each dimension
@@ -1664,7 +1786,7 @@ initialize a tensor with a reference tensor
 >> tensor - the tensor we intend to initialize
 >> reference - the reference tensor
 */
-void InitTensor(XTensor * tensor, XTensor * reference)
+void InitTensor(XTensor * tensor, const XTensor * reference)
 {
    if(reference->order < 0)
        return;

--- a/source/XTensor.h
+++ b/source/XTensor.h
@@ -55,12 +55,13 @@ struct XLink;
 #define UNSAFE_BUT_FAST_MEM
 #define FAST_MATRIX
-/* 
+/* XTensor is a class to do everything a tensor can do :) */
-We implemente the tensor class here though we have defined the class of XMatrix. It
-is the parent class of XMatrix.
-*/
 struct XTensor
 {
+public:
+    /* id */
+    int id;
    /* memory pool */
    XMem * mem;
@@ -70,6 +71,10 @@ struct XTensor
    /* copy of data on the host memory. It is only activated 
       when the matrix is operated on GPUs */
    void * dataHost;
+    /* a pointer to data (i.e., a pointer to the address of "data".
+       This is for reset "data" when XTensor is used as a const variable. */
+    void ** dataP;
    /* 
    device id 
@@ -130,6 +135,9 @@ struct XTensor
    /* indicates whether the tensor is initialized or not */
    bool isInit;
+    /* indicates whether the tensor is created temporarily */
+    bool isTmp;
    /*
    the link used to form networks. Note that when we compute on tensors, we actually create a
@@ -152,7 +160,7 @@ struct XTensor
    XTensor();
    /* constructor */
-    XTensor(XTensor * reference);
+    XTensor(const XTensor * reference);
    /* constructor */
    XTensor(const int myOrder, int myDevID, XMem * myMem);
@@ -161,14 +169,29 @@ struct XTensor
    XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType, 
            const float myDenseRatio, XMem * myMem);
+    /* copy constructor */
+    XTensor(const XTensor &reference);
    /* de-constructor */
    ~XTensor();
    /* delete data arrays */
    void DestroyData();
+    /* shallow copy of tensor */
+    void ShallowCopy(const XTensor &tensor);
    /* overloading of the equal-sign */
-    XTensor& operator = (const XTensor& tensor);
+    XTensor& operator= (const XTensor &tensor);
+    /* overloading of the plus-sign */
+    XTensor  operator+ (const XTensor &tensor);
+    /* overloading of the multiply-sign */
+    XTensor  operator* (const XTensor &tensor);
+    /* linear transformation */
+    XTensor Lin(DTYPE scale, DTYPE shift = 0);
    /* judge whether the two matrices are in the same type and size */
    static
@@ -213,6 +236,9 @@ struct XTensor
    /* check whether the data array is the same as the answer */
    bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
+    /* set the pointer to "data" */
+    void SetDataPointer();
    /* set the cell to the ascending order along a given dimension */
    void SetAscendingOrder(int dim);
@@ -265,6 +291,9 @@ struct XTensor
    /* get the number of non-zero elements (in a sparse tensor) */
    int GetNonzeroSize();
+    /* set the tensor as "temporary" */
+    void SetTMP(bool myIsTmp = true);
    /* resize a matrix with a specified matrix size */
    bool Resize(const int myOrder, const int * myDimSize,
                const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
@@ -299,6 +328,12 @@ struct XTensor
    void FreeData(XTensor * matrix, XMem * myMem = NULL, bool useBuf = false);
 };
+/* we make a unique id for every tensor */
+extern int tensorIDGlobal;
+extern MUTEX_HANDLE tensorMutex;
+extern XTensor firstTensor;
+extern int MakeTensorID();
 /************************************************
 * we define the "new and delete" functions below
 */
@@ -329,7 +364,7 @@ void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, co
                  const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
 /* initialize a tensor with a reference tensor */
-void InitTensor(XTensor * tensor, XTensor * reference);
+void InitTensor(XTensor * tensor, const XTensor * reference);
 /* generate a XTensor */
 XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,

--- a/source/XThread.cpp
+++ b/source/XThread.cpp
--- a/source/XThread.h
+++ b/source/XThread.h
--- a/source/XUtility.cpp
+++ b/source/XUtility.cpp
--- a/source/XUtility.h
+++ b/source/XUtility.h
--- a/source/core/CHeader.h
+++ b/source/core/CHeader.h
--- a/source/tensor/core/arithmetic/Absolute.cpp
+++ b/source/tensor/core/arithmetic/Absolute.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#include "../../XTensor.h"
+#include "Absolute.h"
+#include "Absolute.cuh"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+replace every entry of a tensor by its absolute value (in place)
+Tensors on a GPU (devID >= 0) are passed to CudaAbsolute when the code is
+built with USE_CUDA; otherwise the entries are processed one by one here.
+>> a - the tensor we are processing
+*/
+void Absolute(XTensor * a)
+{
+#ifdef USE_CUDA
+    /* run it on GPUs */
+    if (a->devID >= 0) {
+        CudaAbsolute(a);
+        return;
+    }
+#endif
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+    DTYPE * values = (DTYPE*)a->data;
+    int k = 0;
+    while (k < a->unitNum) {
+        values[k] = (DTYPE)fabs(values[k]);
+        k++;
+    }
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.cu
+++ b/source/tensor/core/arithmetic/Absolute.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Absolute.h"
+#include "Absolute.cuh"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+#ifdef USE_CUDA
+/*
+set each entry to its absolute value (CUDA Kernel)
+Each thread handles the entry at blockDim.x * blockIdx.x + threadIdx.x.
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(DTYPE * d, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    /* threads that fall behind the end of the array have nothing to do */
+    if (idx >= size)
+        return;
+    d[idx] = fabs(d[idx]);
+}
+/*
+set each entry to its absolute value (CUDA Kernel)
+This is for float16 computation
+NOTE(review): the body is a stub - it returns at once and leaves the data
+untouched, so float16 tensors passed through CudaAbsolute are NOT changed.
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(__half * d, int size)
+{
+    return;
+}
+/*
+set each entry to its absolute value (on a GPU)
+The kernel is chosen by the data type of the tensor: DEFAULT_DTYPE or
+X_FLOAT16; any other type is reported as an error. Sparse tensors are
+not supported.
+>> a - the tensor
+*/
+extern "C"
+void CudaAbsolute(XTensor * a)
+{
+    CheckNTErrors((a->isSparse == false), "TODO!");
+    int gridSize[3];
+    int blockSize[3];
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+    dim3 blocks(gridSize[0]);   /* only the first dimension is used */
+    dim3 threads(blockSize[0]);
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+    if (a->dataType == DEFAULT_DTYPE) {
+        KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
+    }
+    else if (a->dataType == X_FLOAT16) {
+        KernelAbsolute << <blocks, threads >> >((__half*)a->data, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Absolute.cuh
+++ b/source/tensor/core/arithmetic/Absolute.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#include "Absolute.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+#ifdef USE_CUDA
+/* set each entry to its absolute value (CUDA Kernel) */
+__global__
+void KernelAbsolute(DTYPE * d, int size);
+/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelAbsolute(__half * d, int size);
+/* set each entry to its absolute value */
+extern "C"
+void CudaAbsolute(XTensor * a);
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/core/arithmetic/Multiply.h
+++ b/source/core/arithmetic/Multiply.h
@@ -16,20 +16,20 @@
 */
 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
 */
-#ifndef __MULTIPLY_H__
+#ifndef __ABSOLUTE_H__
-#define __MULTIPLY_H__
+#define __ABSOLUTE_H__
 #include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* element-wise product of two tensors */
+/* set every entry to its absolute value */
 extern "C"
-void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim = 0, DTYPE alpha = 0);
+void Absolute(XTensor * a);
 } // namespace nts(NiuTrans.Tensor)
-#endif // __MULTIPLY_H__
+#endif // __ABSOLUTE_H__
\ No newline at end of file
--- a/source/core/arithmetic/MatrixMULBatchedCPU.cpp
+++ b/source/core/arithmetic/MatrixMULBatchedCPU.cpp
--- a/source/core/arithmetic/MatrixMULBatchedCPU.h
+++ b/source/core/arithmetic/MatrixMULBatchedCPU.h
--- a/source/core/arithmetic/MatrixMul.cpp
+++ b/source/core/arithmetic/MatrixMul.cpp
--- a/source/core/arithmetic/MatrixMul.h
+++ b/source/core/arithmetic/MatrixMul.h
--- a/source/core/arithmetic/MatrixMul2D.cpp
+++ b/source/core/arithmetic/MatrixMul2D.cpp
--- a/source/core/arithmetic/MatrixMul2D.cu
+++ b/source/core/arithmetic/MatrixMul2D.cu
--- a/source/core/arithmetic/MatrixMul2D.cuh
+++ b/source/core/arithmetic/MatrixMul2D.cuh
--- a/source/core/arithmetic/MatrixMul2D.h
+++ b/source/core/arithmetic/MatrixMul2D.h
--- a/source/core/arithmetic/MatrixMul2DMultiTheading.cpp
+++ b/source/core/arithmetic/MatrixMul2DMultiTheading.cpp
--- a/source/core/arithmetic/MatrixMul2DMultiTheading.h
+++ b/source/core/arithmetic/MatrixMul2DMultiTheading.h
--- a/source/core/arithmetic/MatrixMul2DParallel.cpp
+++ b/source/core/arithmetic/MatrixMul2DParallel.cpp
--- a/source/core/arithmetic/MatrixMul2DParallel.h
+++ b/source/core/arithmetic/MatrixMul2DParallel.h
--- a/source/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/core/arithmetic/MatrixMulBatched.cpp
--- a/source/core/arithmetic/MatrixMulBatched.h
+++ b/source/core/arithmetic/MatrixMulBatched.h
--- a/source/core/arithmetic/Multiply.cpp
+++ b/source/core/arithmetic/Multiply.cpp
@@ -34,18 +34,20 @@ where i is the index of the item
 >> b - matrix b
 >> c - result matrix
 >> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
 >>
 */
-void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha)
+void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
 	int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
-        "Unmatched tensors in multiplication!");
+                  "Unmatched tensors in multiplication!");
-    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), 
+                  "Unmatched tensors!");
 #ifdef USE_CUDA
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-        CudaMultiply(a, b, c, leadingDim, alpha);
+        _CudaMultiply(a, b, c, alpha, leadingDim);
        return;
    }
 #endif
@@ -118,4 +120,46 @@ void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha
    }
 }
+/*
+element-wise product of two tensors, where the first tensor also receives the result
+a(i) = a(i)*b(i) + \alpha * a(i)
+where i is the index of the item
+>> a - tensor a (it is an operand and it keeps the result as well)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
+{
+    /* tensor a is passed twice: as the first operand and as the output */
+    XTensor * result = a;
+    _Multiply(a, b, result, alpha, leadingDim);
+}
+/*
+make a tensor of the element-wise product for two input tensors: 
+c(i) = a(i)*b(i) + \alpha * c(i)
+where i is the index of the item
+The result tensor c is created from a, marked as temporary (SetTMP), and
+linked to a and b with the MATH_MULTIPLY operation; alpha and leadingDim
+are recorded as parameters of that link.
+>> a - tensor a
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+               (it must be a valid dimension index of both a and b)
+<< return - the product of the tensors
+*/
+XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
+{
+    /* leadingDim is used as an index into dimSize right below, so reject 
+       values that do not name a dimension of the inputs */
+    CheckNTErrors((leadingDim >= 0 && leadingDim < a.order && leadingDim < b.order),
+                  "Illegal leading dimension!");
+    CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
+    XTensor c(&a);
+    c.SetTMP();
+    /* computation */
+    _Multiply(&a, &b, &c, alpha, leadingDim);
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
+    XLink::AddParamToHead(&c, alpha);
+    XLink::AddParamToHeadInt(&c, leadingDim);
+    return c;
+}
 } // namespace nts(NiuTrans.Tensor)
--- a/source/core/arithmetic/Multiply.cu
+++ b/source/core/arithmetic/Multiply.cu
@@ -117,15 +117,15 @@ where i is the item index
 >> a - tensor a
 >> b - tensor b
 >> c - result tensor
->> leadingDim - leading dimension
 >> alpha - the coefficient
+>> leadingDim - dimension along which we perform broadcasting
 */
 extern "C"
-void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha)
+void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
 	int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
-        "Unmatched tensors in multiplication!");
+                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
    int stride = 1;
@@ -138,8 +138,8 @@ void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE a
    for (int i = 0; i < a->order; i++) {
        if (i != leadingDimRDI) {
            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
-                "Unmatched tensors!");
+                          "Unmatched tensors!");
        }
        if (i < leadingDimRDI)
            stride *= a->dimSizeRDI[i];

--- a/source/core/arithmetic/Multiply.cuh
+++ b/source/core/arithmetic/Multiply.cuh
@@ -42,7 +42,7 @@ void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE al
 /* element-wise product of two tensors */
 extern "C"
-void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim = 0, DTYPE alpha = 0);
+void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
 #endif // USE_CUDA

--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+*/
+#ifndef __MULTIPLY_H__
+#define __MULTIPLY_H__
+#include "../../XTensor.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* element-wise product of two tensors: 
+   c(i) = a(i)*b(i) + \alpha * c(i) 
+   where i is the index of the element
+   >> a - tensor a
+   >> b - tensor b
+   >> c - result tensor
+   >> alpha - the coefficient
+   >> leadingDim - the dimension along which we perform broadcasting */
+void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
+/* element-wise product of two tensors and keep the result in the input tensor: 
+   a(i) = a(i)*b(i) + \alpha * a(i) 
+   where i is the index of the element
+   >> a - tensor a (it also keeps the result)
+   >> b - tensor b
+   >> alpha - the coefficient
+   >> leadingDim - the dimension along which we perform broadcasting */
+void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
+/* make a tensor of the element-wise product for two input tensors: 
+   c(i) = a(i)*b(i) + \alpha * c(i) 
+   where i is the index of the element
+   >> a - tensor a
+   >> b - tensor b
+   >> alpha - the coefficient
+   >> leadingDim - the dimension along which we perform broadcasting
+   << return - the product of the tensors */
+XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0);
+} // namespace nts(NiuTrans.Tensor)
+#endif // __MULTIPLY_H__
\ No newline at end of file
--- a/source/core/arithmetic/Negate.cpp
+++ b/source/core/arithmetic/Negate.cpp
--- a/source/core/arithmetic/Negate.cu
+++ b/source/core/arithmetic/Negate.cu
--- a/source/core/arithmetic/Negate.cuh
+++ b/source/core/arithmetic/Negate.cuh
--- a/source/core/arithmetic/Negate.h
+++ b/source/core/arithmetic/Negate.h
--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#include "../../XTensor.h"
+#include "Sign.h"
+#include "Sign.cuh"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+replace every entry of a tensor with its sign value:
+1 for a positive entry, 0 for a zero entry and -1 for anything else
+>> a - the tensor we are processing (it is modified on site)
+*/
+void Sign(XTensor * a)
+{
+#ifdef USE_CUDA
+    /* a tensor with a non-negative device id is processed by the CUDA code */
+    if (a->devID >= 0) {
+        CudaSign(a);
+        return;
+    }
+#endif
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+    DTYPE * entries = (DTYPE*)a->data;
+    const int num = a->unitNum;
+    int k = 0;
+    while (k < num) {
+        const DTYPE value = entries[k];
+        /* values that are neither positive nor zero fall to the last case */
+        entries[k] = (value > 0) ? 1.0F : ((value == 0) ? 0.0F : -1.0F);
+        k++;
+    }
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.cu
+++ b/source/tensor/core/arithmetic/Sign.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Sign.h"
+#include "Sign.cuh"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+#ifdef USE_CUDA
+/*
+set each entry to its sign value (CUDA Kernel)
+every thread computes one index from its block and thread ids
+and processes the entry at that index (if there is one)
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(DTYPE * d, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    /* no entry for this thread */
+    if (idx >= size)
+        return;
+    const DTYPE value = d[idx];
+    d[idx] = (value > 0) ? 1.0F : ((value == 0) ? 0.0F : -1.0F);
+}
+/*
+set each entry to its sign value (CUDA Kernel)
+This is for float16 computation
+NOTE: this kernel is a placeholder. Its body returns immediately, so the
+data array is left unchanged and no sign value is computed.
+TODO(review): implement the float16 computation or report an error in the caller.
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(__half * d, int size)
+{
+    return; /* nothing is computed for float16 yet */
+}
+/*
+set each entry of a tensor to its sign value (CUDA version)
+the kernel is selected according to the data type of the tensor
+(DEFAULT_DTYPE or X_FLOAT16); other data types result in an error.
+sparse tensors are not supported
+>> a - the tensor
+*/
+extern "C"
+void CudaSign(XTensor * a)
+{
+    CheckNTErrors((a->isSparse == false), "TODO!");
+    /* grid and block sizes for processing a->unitNum entries on the device */
+    int cudaGrids[3];
+    int cudaBlocks[3];
+    GDevs.GetCudaThread(a->devID, a->unitNum, cudaGrids, cudaBlocks);
+    dim3 blocks(cudaGrids[0]);
+    dim3 threads(cudaBlocks[0]);
+    const bool isDefaultType = (a->dataType == DEFAULT_DTYPE);
+    const bool isFloat16 = (a->dataType == X_FLOAT16);
+    int backupDevID;
+    ProtectCudaDev(a->devID, backupDevID);
+    if (isDefaultType) {
+        KernelSign<<<blocks, threads>>>((DTYPE*)a->data, a->unitNum);
+    }
+    else if (isFloat16) {
+        KernelSign<<<blocks, threads>>>((__half*)a->data, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+    BacktoCudaDev(a->devID, backupDevID);
+}
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sign.cuh
+++ b/source/tensor/core/arithmetic/Sign.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#ifndef __SIGN_CUH__
+#define __SIGN_CUH__
+#include "Sign.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+#ifdef USE_CUDA
+/* 
+set each entry to its sign value (CUDA Kernel) 
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(DTYPE * d, int size);
+/* 
+set each entry to its sign value (CUDA Kernel) with float16 data type
+>> d - pointer to the data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(__half * d, int size);
+/* 
+set each entry to its sign value 
+>> a - the tensor
+*/
+extern "C"
+void CudaSign(XTensor * a);
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
+#endif // __SIGN_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#ifndef __SIGN_H__
+#define __SIGN_H__
+#include "../../XTensor.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* set every entry to its sign value
+   (1 for a positive entry, 0 for a zero entry and -1 otherwise)
+   >> a - the tensor we are processing, its data is overwritten */
+extern "C"
+void Sign(XTensor * a);
+} // namespace nts(NiuTrans.Tensor)
+#endif // __SIGN_H__
--- a/source/core/arithmetic/Sum.cpp
+++ b/source/core/arithmetic/Sum.cpp
@@ -35,12 +35,9 @@ return a pointer
 >> c - where we put a+b*\beta (it must not be NULL)
 >> beta - the scaling factor
 */
-void _Sum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
+void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
-    if (c == NULL)
+    CheckNTErrors(a && b && c, "Empty tensor input!");
-        c = a;
-    CheckNTErrors(a && b && c, "Empty tensors in addition!");
    CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
                  "Unmatched tensors in addition!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
@@ -121,7 +118,7 @@ do it on site
 >> b - another tensor
 >> beta - the scaling factor
 */
-void _SumMe(XTensor * a, XTensor * b, DTYPE beta)
+void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
 {
    _Sum(a, b, a, beta);
 }
@@ -133,16 +130,17 @@ return a XTensor structure
 >> b - another tensor
 >> beta - the scaling factor
 */
-XTensor Sum(XTensor &a, XTensor &b, DTYPE beta)
+XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
 {
    XTensor c(&a);
+    c.SetTMP();
    /* computation */
    _Sum(&a, &b, &c, beta);
    /* tensor connections */
-    //XLink::MakeLink(&a, &b, &c, MATH_SUM);
+    XLink::MakeLink(&a, &b, &c, MATH_SUM);
-    //XLink::AddParamToHead(&c, beta);
+    XLink::AddParamToHead(&c, beta);
    return c;
 }

--- a/source/core/arithmetic/Sum.cu
+++ b/source/core/arithmetic/Sum.cu
@@ -51,11 +51,9 @@ tensor summation c = a + b * \beta (cuda version)
 >> c - where we put a+b*\beta (it must not be NULL)
 >> beta - the scaling factor
 */
-void _CudaSum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
+void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
-    if (c == NULL)
+    CheckNTErrors(a && b && c, "Empty tensor input!");
-        c = a;
    CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
                  "Unmatched tensors in addition!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),

--- a/source/core/arithmetic/Sum.cuh
+++ b/source/core/arithmetic/Sum.cuh
@@ -34,7 +34,7 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.
 /* tensor summation c = a + b * \beta (cuda version) */
 extern "C"
-void _CudaSum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
+void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 /*  tensor summation c = a + b * \beta (cuda version) with an input handle */
 extern "C"

--- a/source/core/arithmetic/Sum.h
+++ b/source/core/arithmetic/Sum.h
@@ -27,13 +27,13 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* tensor summation c = a + b * \beta */
-void _Sum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
+void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 /* tensor summation a = a + b * \beta (return a pointer) */
-void _SumMe(XTensor * a, XTensor * b, DTYPE beta = (DTYPE)1.0);
+void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (return a structure) */
-XTensor Sum(XTensor &a, XTensor &b, DTYPE beta = (DTYPE)1.0);
+XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/core/arithmetic/SumByColumnTV.cpp
+++ b/source/core/arithmetic/SumByColumnTV.cpp
--- a/source/core/arithmetic/SumByColumnTV.cu
+++ b/source/core/arithmetic/SumByColumnTV.cu
--- a/source/core/arithmetic/SumByColumnTV.cuh
+++ b/source/core/arithmetic/SumByColumnTV.cuh
--- a/source/core/arithmetic/SumByColumnTV.h
+++ b/source/core/arithmetic/SumByColumnTV.h
--- a/source/core/arithmetic/SumByColumnVT.cpp
+++ b/source/core/arithmetic/SumByColumnVT.cpp
--- a/source/core/arithmetic/SumByColumnVT.cu
+++ b/source/core/arithmetic/SumByColumnVT.cu
--- a/source/core/arithmetic/SumByColumnVT.cuh
+++ b/source/core/arithmetic/SumByColumnVT.cuh
--- a/source/core/arithmetic/SumByColumnVT.h
+++ b/source/core/arithmetic/SumByColumnVT.h
--- a/source/core/arithmetic/XTensorBLAS.cpp
+++ b/source/core/arithmetic/XTensorBLAS.cpp
--- a/source/core/arithmetic/XTensorBLAS.cu
+++ b/source/core/arithmetic/XTensorBLAS.cu
--- a/source/core/arithmetic/XTensorBLAS.h
+++ b/source/core/arithmetic/XTensorBLAS.h
--- a/source/tensor/core/getandset/ConvertDataType.cpp
+++ b/source/tensor/core/getandset/ConvertDataType.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#include "../../XTensor.h"
+#include "ConvertDataType.h"
+#include "ConvertDataType.cuh"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+convert data type (X_FLOAT -> X_INT or X_INT -> X_FLOAT)
+each item of the input is cast and written to the same position of the output.
+nothing is done if the two tensors are of the same data type, and
+other pairs of data types result in an error
+NOTE(review): ConvertDataType.h declares "ConvertDataType" while this function 
+is named "ConvertTensorDataType" - confirm which name is intended.
+NOTE(review): the check below relies on XTensor::IsIdentical; its message mentions 
+"type" - confirm that it accepts tensors of different data types.
+>> input - input tensor
+>> output - output tensor
+*/
+void ConvertTensorDataType(XTensor * input, XTensor * output)
+{
+    CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!");
+    /* same data type on both sides: no conversion is needed */
+    if (input->dataType == output->dataType)
+        return;
+#ifdef USE_CUDA
+    /* run it on GPUs */
+    if (input->devID >= 0) {
+        CudaConvertDataType(input, output);
+        return;
+    }
+#endif
+    const bool floatToInt = (input->dataType == X_FLOAT && output->dataType == X_INT);
+    const bool intToFloat = (input->dataType == X_INT && output->dataType == X_FLOAT);
+    if (floatToInt) {
+        float * source = (float*)input->data;
+        int * target = (int*)output->data;
+        for (int k = 0; k < input->unitNum; k++)
+            target[k] = (int)source[k];
+    }
+    else if (intToFloat) {
+        int * source = (int*)input->data;
+        float * target = (float*)output->data;
+        for (int k = 0; k < input->unitNum; k++)
+            target[k] = (float)source[k];
+    }
+    else {
+        ShowNTErrors("Unsupported data types for conversion!");
+    }
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/core/getandset/ConvertDataType.cu
+++ b/source/core/getandset/ConvertDataType.cu
@@ -21,6 +21,7 @@
 #include "../../XTensor.h"
 #include "../../XDevice.h"
+#include "ConvertDataType.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -49,6 +50,24 @@ void KernelFloat16ToFloat(__half * s, float * t, int size)
    }
 }
+/*
+convert data type from X_FLOAT to X_INT (CUDA Kernel)
+>> inputData - the float array we read from
+>> outputData - the int array we write to
+>> size - number of items to convert
+*/
+__global__
+void KernelFloatToInt(float * inputData, int * outputData, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    /* no item for this thread */
+    if (idx >= size)
+        return;
+    outputData[idx] = (int)inputData[idx];
+}
+/*
+convert data type from X_INT to X_FLOAT (CUDA Kernel)
+>> inputData - the int array we read from
+>> outputData - the float array we write to
+>> size - number of items to convert
+*/
+__global__
+void KernelIntToFloat(int * inputData, float * outputData, int size)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    /* no item for this thread */
+    if (idx >= size)
+        return;
+    outputData[idx] = (float)inputData[idx];
+}
 /* 
 data conversion (cuda code) 
@@ -88,6 +107,39 @@ void CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, 
    ProtectCudaDev(devID, devIDBackup);
 }
+/*
+convert data type (cuda code) 
+X_FLOAT -> X_INT and X_INT -> X_FLOAT are supported. Nothing is done if 
+the two tensors are of the same data type, and other pairs of data types 
+result in an error
+>> input - input tensor
+>> output - output tensor
+*/
+void CudaConvertDataType(XTensor * input, XTensor * output)
+{
+    CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!");
+    if (input->dataType == output->dataType)
+        return;
+    /* grid and block sizes for processing input->unitNum items on the device */
+    int gridSize[3];
+    int blockSize[3];
+    GDevs.GetCudaThread(input->devID, input->unitNum, gridSize, blockSize);
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+    int devIDBackup;
+    ProtectCudaDev(input->devID, devIDBackup);
+    if(input->dataType == X_FLOAT && output->dataType == X_INT)
+        KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
+    else if(input->dataType == X_INT && output->dataType == X_FLOAT)
+        KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
+    else{
+        ShowNTErrors("Unsupported data types for conversion!");
+    }
+    /* pair ProtectCudaDev with BacktoCudaDev (as the other CUDA functions do)
+       so that we get back to the device that was in use before the call */
+    BacktoCudaDev(input->devID, devIDBackup);
+}
 #endif // USE_CUDA
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/getandset/ConvertDataType.cuh
+++ b/source/tensor/core/getandset/ConvertDataType.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#ifndef __CONVERTDATATYPE_CUH__
+#define __CONVERTDATATYPE_CUH__
+#include "ConvertDataType.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+#ifdef USE_CUDA
+/* 
+convert data type from X_FLOAT to X_FLOAT16 (CUDA Kernel) 
+>> s - source data array
+>> t - target data array
+>> size - number of the items in s (and t)
+*/
+__global__
+void KernelFloatToFloat16(float * s, __half * t, int size);
+/* 
+convert data type from X_FLOAT16 to X_FLOAT (CUDA Kernel) 
+>> s - source data array
+>> t - target data array
+>> size - number of the items in s (and t)
+*/
+__global__
+void KernelFloat16ToFloat(__half * s, float * t, int size);
+/* 
+convert data type from X_FLOAT to X_INT (CUDA Kernel) 
+>> inputData - the float array we read from
+>> outputData - the int array we write to
+>> size - number of items to convert
+*/
+__global__
+void KernelFloatToInt(float * inputData, int * outputData, int size);
+/* 
+convert data type from X_INT to X_FLOAT (CUDA Kernel) 
+>> inputData - the int array we read from
+>> outputData - the float array we write to
+>> size - number of items to convert
+*/
+__global__
+void KernelIntToFloat(int * inputData, float * outputData, int size);
+/* 
+convert data type 
+>> input - input tensor
+>> output - output tensor
+*/
+void CudaConvertDataType(XTensor * input, XTensor * output);
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
+#endif // __CONVERTDATATYPE_CUH__
\ No newline at end of file
--- a/source/tensor/core/getandset/ConvertDataType.h
+++ b/source/tensor/core/getandset/ConvertDataType.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+#ifndef __CONVERTDATATYPE_H__
+#define __CONVERTDATATYPE_H__
+#include "../../XTensor.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* convert data type
+   >> input - input tensor
+   >> output - output tensor
+   NOTE(review): the function defined in ConvertDataType.cpp is named
+   "ConvertTensorDataType" - confirm that this declaration has a definition */
+void ConvertDataType(XTensor * input, XTensor * output);
+} // namespace nts(NiuTrans.Tensor)
+#endif // __CONVERTDATATYPE_H__
--- a/source/core/getandset/Select.cpp
+++ b/source/core/getandset/Select.cpp
--- a/source/core/getandset/Select.cu
+++ b/source/core/getandset/Select.cu
--- a/source/core/getandset/Select.cuh
+++ b/source/core/getandset/Select.cuh
--- a/source/core/getandset/Select.h
+++ b/source/core/getandset/Select.h
--- a/source/core/getandset/SetData.cpp
+++ b/source/core/getandset/SetData.cpp
--- a/source/core/getandset/SetData.cu
+++ b/source/core/getandset/SetData.cu
--- a/source/core/getandset/SetData.cuh
+++ b/source/core/getandset/SetData.cuh
--- a/source/core/getandset/SetData.h
+++ b/source/core/getandset/SetData.h
--- a/source/core/arithmetic/Log.cpp
+++ b/source/core/arithmetic/Log.cpp
--- a/source/core/arithmetic/Log.cu
+++ b/source/core/arithmetic/Log.cu
--- a/source/core/arithmetic/Log.cuh
+++ b/source/core/arithmetic/Log.cuh
--- a/source/core/arithmetic/Log.h
+++ b/source/core/arithmetic/Log.h
--- a/source/core/math/Normalize.cpp
+++ b/source/core/math/Normalize.cpp
--- a/source/core/math/Normalize.cu
+++ b/source/core/math/Normalize.cu
--- a/source/core/math/Normalize.cuh
+++ b/source/core/math/Normalize.cuh
--- a/source/core/math/Normalize.h
+++ b/source/core/math/Normalize.h
--- a/source/core/math/Power.cpp
+++ b/source/core/math/Power.cpp
--- a/source/core/math/Power.cu
+++ b/source/core/math/Power.cu
--- a/source/core/math/Power.cuh
+++ b/source/core/math/Power.cuh
--- a/source/core/math/Power.h
+++ b/source/core/math/Power.h
--- a/source/core/math/ScaleAndShift.cpp
+++ b/source/core/math/ScaleAndShift.cpp
--- a/source/core/math/ScaleAndShift.cu
+++ b/source/core/math/ScaleAndShift.cu
--- a/source/core/math/ScaleAndShift.cuh
+++ b/source/core/math/ScaleAndShift.cuh
--- a/source/core/math/ScaleAndShift.h
+++ b/source/core/math/ScaleAndShift.h
--- a/source/core/movement/CopyBlocks.cpp
+++ b/source/core/movement/CopyBlocks.cpp
--- a/source/core/movement/CopyBlocks.h
+++ b/source/core/movement/CopyBlocks.h
--- a/source/core/movement/CopyBlocksInGrid.cpp
+++ b/source/core/movement/CopyBlocksInGrid.cpp
--- a/source/core/movement/CopyBlocksInGrid.cu
+++ b/source/core/movement/CopyBlocksInGrid.cu
--- a/source/core/movement/CopyBlocksInGrid.cuh
+++ b/source/core/movement/CopyBlocksInGrid.cuh
--- a/source/core/movement/CopyBlocksInGrid.h
+++ b/source/core/movement/CopyBlocksInGrid.h
--- a/source/core/movement/CopyBlocksOnSite.cpp
+++ b/source/core/movement/CopyBlocksOnSite.cpp
--- a/source/core/movement/CopyBlocksOnSite.cu
+++ b/source/core/movement/CopyBlocksOnSite.cu
--- a/source/core/movement/CopyBlocksOnSite.cuh
+++ b/source/core/movement/CopyBlocksOnSite.cuh
--- a/source/core/movement/CopyBlocksOnSite.h
+++ b/source/core/movement/CopyBlocksOnSite.h
--- a/source/core/movement/CopyBlocksSelected.cu
+++ b/source/core/movement/CopyBlocksSelected.cu
--- a/source/core/movement/CopyBlocksSelected.cuh
+++ b/source/core/movement/CopyBlocksSelected.cuh
--- a/source/core/movement/CopyData2D.cpp
+++ b/source/core/movement/CopyData2D.cpp
--- a/source/core/movement/CopyData2D.h
+++ b/source/core/movement/CopyData2D.h
--- a/source/core/movement/CopyInGrid.cpp
+++ b/source/core/movement/CopyInGrid.cpp
--- a/source/core/movement/CopyInGrid.h
+++ b/source/core/movement/CopyInGrid.h
--- a/source/core/movement/CopyIndexed.cpp
+++ b/source/core/movement/CopyIndexed.cpp
--- a/source/core/movement/CopyIndexed.h
+++ b/source/core/movement/CopyIndexed.h
--- a/source/core/movement/CopyValues.cpp
+++ b/source/core/movement/CopyValues.cpp
--- a/source/core/movement/CopyValues.cu
+++ b/source/core/movement/CopyValues.cu
--- a/source/core/movement/CopyValues.cuh
+++ b/source/core/movement/CopyValues.cuh
--- a/source/core/movement/CopyValues.h
+++ b/source/core/movement/CopyValues.h
--- a/source/core/reduce/ReduceMax.cpp
+++ b/source/core/reduce/ReduceMax.cpp
--- a/source/core/reduce/ReduceMax.cu
+++ b/source/core/reduce/ReduceMax.cu
--- a/source/core/reduce/ReduceMax.cuh
+++ b/source/core/reduce/ReduceMax.cuh
--- a/source/core/reduce/ReduceMax.h
+++ b/source/core/reduce/ReduceMax.h
--- a/source/core/reduce/ReduceMean.cpp
+++ b/source/core/reduce/ReduceMean.cpp
--- a/source/core/reduce/ReduceMean.h
+++ b/source/core/reduce/ReduceMean.h
--- a/source/core/reduce/ReduceStandardVariance.h
+++ b/source/core/reduce/ReduceStandardVariance.h
--- a/source/core/reduce/ReduceSum.cpp
+++ b/source/core/reduce/ReduceSum.cpp
--- a/source/core/reduce/ReduceSum.cu
+++ b/source/core/reduce/ReduceSum.cu
--- a/source/core/reduce/ReduceSum.cuh
+++ b/source/core/reduce/ReduceSum.cuh
--- a/source/core/reduce/ReduceSum.h
+++ b/source/core/reduce/ReduceSum.h
--- a/source/core/reduce/ReduceSumSquared.cpp
+++ b/source/core/reduce/ReduceSumSquared.cpp
--- a/source/core/reduce/ReduceSumSquared.h
+++ b/source/core/reduce/ReduceSumSquared.h
--- a/source/core/reduce/ReduceVariance.cpp
+++ b/source/core/reduce/ReduceVariance.cpp
--- a/source/core/reduce/ReduceVariance.h
+++ b/source/core/reduce/ReduceVariance.h
--- a/source/core/shape/Concatenate.cpp
+++ b/source/core/shape/Concatenate.cpp
--- a/source/core/shape/Concatenate.h
+++ b/source/core/shape/Concatenate.h
--- a/source/core/shape/ConcatenateSolely.cpp
+++ b/source/core/shape/ConcatenateSolely.cpp
--- a/source/core/shape/ConcatenateSolely.h
+++ b/source/core/shape/ConcatenateSolely.h
--- a/source/core/shape/MakeMergeBlockIndex.cpp
+++ b/source/core/shape/MakeMergeBlockIndex.cpp
--- a/source/core/shape/MakeMergeBlockIndex.cu
+++ b/source/core/shape/MakeMergeBlockIndex.cu
--- a/source/core/shape/MakeMergeBlockIndex.cuh
+++ b/source/core/shape/MakeMergeBlockIndex.cuh
--- a/source/core/shape/MakeMergeBlockIndex.h
+++ b/source/core/shape/MakeMergeBlockIndex.h
--- a/source/core/shape/MakeSplitBlockIndex.cpp
+++ b/source/core/shape/MakeSplitBlockIndex.cpp
--- a/source/core/shape/MakeSplitBlockIndex.cu
+++ b/source/core/shape/MakeSplitBlockIndex.cu
--- a/source/core/shape/MakeSplitBlockIndex.cuh
+++ b/source/core/shape/MakeSplitBlockIndex.cuh
--- a/source/core/shape/MakeSplitBlockIndex.h
+++ b/source/core/shape/MakeSplitBlockIndex.h
--- a/source/core/shape/Merge.cpp
+++ b/source/core/shape/Merge.cpp
--- a/source/core/shape/Merge.h
+++ b/source/core/shape/Merge.h
--- a/source/core/shape/MergeBlockLists.cpp
+++ b/source/core/shape/MergeBlockLists.cpp
--- a/source/core/shape/MergeBlockLists.cu
+++ b/source/core/shape/MergeBlockLists.cu
--- a/source/core/shape/MergeBlockLists.cuh
+++ b/source/core/shape/MergeBlockLists.cuh
--- a/source/core/shape/MergeBlockLists.h
+++ b/source/core/shape/MergeBlockLists.h
--- a/source/core/shape/Permute.cpp
+++ b/source/core/shape/Permute.cpp
--- a/source/core/shape/Permute.h
+++ b/source/core/shape/Permute.h
--- a/source/core/shape/Split.cpp
+++ b/source/core/shape/Split.cpp
--- a/source/core/shape/Split.h
+++ b/source/core/shape/Split.h
--- a/source/core/shape/Transpose.cpp
+++ b/source/core/shape/Transpose.cpp
--- a/source/core/shape/Transpose.h
+++ b/source/core/shape/Transpose.h
--- a/source/core/shape/Unsqueeze.cpp
+++ b/source/core/shape/Unsqueeze.cpp
--- a/source/core/shape/Unsqueeze.cu
+++ b/source/core/shape/Unsqueeze.cu
--- a/source/core/shape/Unsqueeze.cuh
+++ b/source/core/shape/Unsqueeze.cuh
--- a/source/core/shape/Unsqueeze.h
+++ b/source/core/shape/Unsqueeze.h
--- a/source/core/sort/Sort.cpp
+++ b/source/core/sort/Sort.cpp
--- a/source/core/sort/Sort.cu
+++ b/source/core/sort/Sort.cu
--- a/source/core/sort/Sort.cuh
+++ b/source/core/sort/Sort.cuh
--- a/source/core/sort/Sort.h
+++ b/source/core/sort/Sort.h
--- a/source/core/sort/TopK.cpp
+++ b/source/core/sort/TopK.cpp
--- a/source/core/sort/TopK.cu
+++ b/source/core/sort/TopK.cu
--- a/source/core/sort/TopK.cuh
+++ b/source/core/sort/TopK.cuh
--- a/source/core/sort/TopK.h
+++ b/source/core/sort/TopK.h
--- a/source/core/utilities/FlushToMem.cpp
+++ b/source/core/utilities/FlushToMem.cpp
--- a/source/core/utilities/FlushToMem.cu
+++ b/source/core/utilities/FlushToMem.cu
--- a/source/core/utilities/FlushToMem.cuh
+++ b/source/core/utilities/FlushToMem.cuh
--- a/source/core/utilities/FlushToMem.h
+++ b/source/core/utilities/FlushToMem.h
--- a/source/core/utilities/SetAscendingOrder.cu
+++ b/source/core/utilities/SetAscendingOrder.cu
--- a/source/core/utilities/SetAscendingOrder.cuh
+++ b/source/core/utilities/SetAscendingOrder.cuh
--- a/source/core/utilities/XMatrixSegment.cpp
+++ b/source/core/utilities/XMatrixSegment.cpp
--- a/source/core/utilities/XMatrixSegment.h
+++ b/source/core/utilities/XMatrixSegment.h
--- a/source/function/FHeader.h
+++ b/source/function/FHeader.h
--- a/source/function/HardTanH.cpp
+++ b/source/function/HardTanH.cpp
--- a/source/function/HardTanH.cu
+++ b/source/function/HardTanH.cu
--- a/source/function/HardTanH.cuh
+++ b/source/function/HardTanH.cuh
--- a/source/function/HardTanH.h
+++ b/source/function/HardTanH.h
--- a/source/function/Identity.cpp
+++ b/source/function/Identity.cpp
--- a/source/function/Identity.h
+++ b/source/function/Identity.h
--- a/source/function/LogSoftmax.cpp
+++ b/source/function/LogSoftmax.cpp
--- a/source/function/LogSoftmax.cu
+++ b/source/function/LogSoftmax.cu
--- a/source/function/LogSoftmax.cuh
+++ b/source/function/LogSoftmax.cuh
--- a/source/function/LogSoftmax.h
+++ b/source/function/LogSoftmax.h
--- a/source/function/Loss.cpp
+++ b/source/function/Loss.cpp
--- a/source/function/Loss.cu
+++ b/source/function/Loss.cu
--- a/source/function/Loss.cuh
+++ b/source/function/Loss.cuh
--- a/source/function/Loss.h
+++ b/source/function/Loss.h
--- a/source/function/Rectify.cpp
+++ b/source/function/Rectify.cpp
--- a/source/function/Rectify.cu
+++ b/source/function/Rectify.cu
--- a/source/function/Rectify.cuh
+++ b/source/function/Rectify.cuh
--- a/source/function/Rectify.h
+++ b/source/function/Rectify.h
--- a/source/function/Sigmoid.cpp
+++ b/source/function/Sigmoid.cpp
--- a/source/function/Sigmoid.cu
+++ b/source/function/Sigmoid.cu
--- a/source/function/Sigmoid.cuh
+++ b/source/function/Sigmoid.cuh
--- a/source/function/Sigmoid.h
+++ b/source/function/Sigmoid.h
--- a/source/function/Softmax.cpp
+++ b/source/function/Softmax.cpp
--- a/source/function/Softmax.cu
+++ b/source/function/Softmax.cu
--- a/source/function/Softmax.cuh
+++ b/source/function/Softmax.cuh
--- a/source/function/Softmax.h
+++ b/source/function/Softmax.h
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
--- a/source/sample/fnnlm/FNNLM.h
+++ b/source/sample/fnnlm/FNNLM.h
--- a/source/test/TConcatenate.cpp
+++ b/source/test/TConcatenate.cpp
--- a/source/test/TConcatenate.h
+++ b/source/test/TConcatenate.h
--- a/source/test/TConcatenateSolely.cpp
+++ b/source/test/TConcatenateSolely.cpp
--- a/source/test/TConcatenateSolely.h
+++ b/source/test/TConcatenateSolely.h
--- a/source/test/TCopyIndexed.cpp
+++ b/source/test/TCopyIndexed.cpp
--- a/source/test/TCopyIndexed.h
+++ b/source/test/TCopyIndexed.h
--- a/source/tensor/test/TCopyValues.cpp
+++ b/source/tensor/test/TCopyValues.cpp
--- a/source/test/TCopyValues.h
+++ b/source/test/TCopyValues.h
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/test/THardTanH.h
+++ b/source/test/THardTanH.h
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/test/TIdentity.h
+++ b/source/test/TIdentity.h
--- a/source/test/TLogSoftmax.cpp
+++ b/source/test/TLogSoftmax.cpp
--- a/source/test/TLogSoftmax.h
+++ b/source/test/TLogSoftmax.h
--- a/source/tensor/test/TLoss.cpp
+++ b/source/tensor/test/TLoss.cpp
--- a/source/test/TLoss.h
+++ b/source/test/TLoss.h
--- a/source/tensor/test/TMatrixMULBatchedCPU.cpp
+++ b/source/tensor/test/TMatrixMULBatchedCPU.cpp
--- a/source/test/TMatrixMULBatchedCPU.h
+++ b/source/test/TMatrixMULBatchedCPU.h
--- a/source/test/TMatrixMul.cpp
+++ b/source/test/TMatrixMul.cpp
--- a/source/test/TMatrixMul.h
+++ b/source/test/TMatrixMul.h
--- a/source/tensor/test/TMatrixMul2D.cpp
+++ b/source/tensor/test/TMatrixMul2D.cpp
--- a/source/test/TMatrixMul2D.h
+++ b/source/test/TMatrixMul2D.h
--- a/source/tensor/test/TMatrixMul2DParallel.cpp
+++ b/source/tensor/test/TMatrixMul2DParallel.cpp
--- a/source/tensor/test/TMatrixMul2DParallel.h
+++ b/source/tensor/test/TMatrixMul2DParallel.h
--- a/source/tensor/test/TMatrixMulBatched.cpp
+++ b/source/tensor/test/TMatrixMulBatched.cpp
--- a/source/tensor/test/TMatrixMulBatched.h
+++ b/source/tensor/test/TMatrixMulBatched.h
--- a/source/test/TMerge.cpp
+++ b/source/test/TMerge.cpp
--- a/source/tensor/test/TMerge.h
+++ b/source/tensor/test/TMerge.h
--- a/source/test/TMultiply.cpp
+++ b/source/test/TMultiply.cpp
--- a/source/tensor/test/TMultiply.h
+++ b/source/tensor/test/TMultiply.h
--- a/source/tensor/test/TNegate.cpp
+++ b/source/tensor/test/TNegate.cpp
--- a/source/test/TMatrixMulBatched.h
+++ b/source/test/TMatrixMulBatched.h
--- a/source/tensor/test/TNormalize.cpp
+++ b/source/tensor/test/TNormalize.cpp
--- a/source/tensor/test/TNormalize.h
+++ b/source/tensor/test/TNormalize.h
--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
--- a/source/tensor/test/TPower.h
+++ b/source/tensor/test/TPower.h
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/tensor/test/TRectify.h
+++ b/source/tensor/test/TRectify.h
--- a/source/tensor/test/TReduceMax.cpp
+++ b/source/tensor/test/TReduceMax.cpp
--- a/source/tensor/test/TReduceMax.h
+++ b/source/tensor/test/TReduceMax.h
--- a/source/tensor/test/TReduceMean.cpp
+++ b/source/tensor/test/TReduceMean.cpp
--- a/source/tensor/test/TReduceMean.h
+++ b/source/tensor/test/TReduceMean.h
--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
--- a/source/tensor/test/TReduceSum.h
+++ b/source/tensor/test/TReduceSum.h
--- a/source/tensor/test/TReduceSumSquared.cpp
+++ b/source/tensor/test/TReduceSumSquared.cpp
--- a/source/tensor/test/TReduceSumSquared.h
+++ b/source/tensor/test/TReduceSumSquared.h
--- a/source/tensor/test/TReduceVariance.cpp
+++ b/source/tensor/test/TReduceVariance.cpp
--- a/source/tensor/test/TReduceVariance.h
+++ b/source/tensor/test/TReduceVariance.h
--- a/source/tensor/test/TScaleAndShift.cpp
+++ b/source/tensor/test/TScaleAndShift.cpp
--- a/source/tensor/test/TScaleAndShift.h
+++ b/source/tensor/test/TScaleAndShift.h
--- a/source/tensor/test/TSelect.cpp
+++ b/source/tensor/test/TSelect.cpp
--- a/source/tensor/test/TSelect.h
+++ b/source/tensor/test/TSelect.h
--- a/source/tensor/test/TSetAscendingOrder.cpp
+++ b/source/tensor/test/TSetAscendingOrder.cpp
--- a/source/tensor/test/TSetAscendingOrder.h
+++ b/source/tensor/test/TSetAscendingOrder.h
--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
--- a/source/test/TMatrixMul2DParallel.h
+++ b/source/test/TMatrixMul2DParallel.h
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/tensor/test/TSigmoid.h
+++ b/source/tensor/test/TSigmoid.h
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
--- a/source/tensor/test/TSoftmax.h
+++ b/source/tensor/test/TSoftmax.h
--- a/source/tensor/test/TSort.cpp
+++ b/source/tensor/test/TSort.cpp
--- a/source/tensor/test/TSort.h
+++ b/source/tensor/test/TSort.h
--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
--- a/source/tensor/test/TSplit.h
+++ b/source/tensor/test/TSplit.h
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/source/tensor/test/TSum.h
+++ b/source/tensor/test/TSum.h
--- a/source/tensor/test/TSumByColumnTV.cpp
+++ b/source/tensor/test/TSumByColumnTV.cpp
--- a/source/tensor/test/TSumByColumnTV.h
+++ b/source/tensor/test/TSumByColumnTV.h
--- a/source/tensor/test/TSumByColumnVT.cpp
+++ b/source/tensor/test/TSumByColumnVT.cpp
--- a/source/tensor/test/TSumByColumnVT.h
+++ b/source/tensor/test/TSumByColumnVT.h
--- a/source/tensor/test/TTopK.cpp
+++ b/source/tensor/test/TTopK.cpp
--- a/source/tensor/test/TTopK.h
+++ b/source/tensor/test/TTopK.h
--- a/source/tensor/test/TUnsqueeze.cpp
+++ b/source/tensor/test/TUnsqueeze.cpp
--- a/source/tensor/test/TUnsqueeze.h
+++ b/source/tensor/test/TUnsqueeze.h
--- a/source/tensor/test/TXMem.cpp
+++ b/source/tensor/test/TXMem.cpp
--- a/source/tensor/test/TXMem.h
+++ b/source/tensor/test/TXMem.h
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h
--- a/source/test/TCopyValues.cpp
+++ b/source/test/TCopyValues.cpp
--- a/source/test/THardTanH.cpp
+++ b/source/test/THardTanH.cpp
--- a/source/test/TIdentity.cpp
+++ b/source/test/TIdentity.cpp
--- a/source/test/TLoss.cpp
+++ b/source/test/TLoss.cpp
--- a/source/test/TMatrixMULBatchedCPU.cpp
+++ b/source/test/TMatrixMULBatchedCPU.cpp
--- a/source/test/TMatrixMul2D.cpp
+++ b/source/test/TMatrixMul2D.cpp
--- a/source/test/TMatrixMul2DParallel.cpp
+++ b/source/test/TMatrixMul2DParallel.cpp
--- a/source/test/TMatrixMulBatched.cpp
+++ b/source/test/TMatrixMulBatched.cpp
--- a/source/test/TMerge.h
+++ b/source/test/TMerge.h
--- a/source/test/TMultiply.h
+++ b/source/test/TMultiply.h
--- a/source/test/TNegate.cpp
+++ b/source/test/TNegate.cpp
--- a/source/test/TNegate.h
+++ b/source/test/TNegate.h
--- a/source/test/TNormalize.cpp
+++ b/source/test/TNormalize.cpp
--- a/source/test/TNormalize.h
+++ b/source/test/TNormalize.h
--- a/source/test/TPower.cpp
+++ b/source/test/TPower.cpp
--- a/source/test/TPower.h
+++ b/source/test/TPower.h
--- a/source/test/TRectify.cpp
+++ b/source/test/TRectify.cpp
--- a/source/test/TRectify.h
+++ b/source/test/TRectify.h
--- a/source/test/TReduceMax.cpp
+++ b/source/test/TReduceMax.cpp
--- a/source/test/TReduceMax.h
+++ b/source/test/TReduceMax.h
--- a/source/test/TReduceMean.cpp
+++ b/source/test/TReduceMean.cpp
--- a/source/test/TReduceMean.h
+++ b/source/test/TReduceMean.h
--- a/source/test/TReduceSum.cpp
+++ b/source/test/TReduceSum.cpp
--- a/source/test/TReduceSum.h
+++ b/source/test/TReduceSum.h
--- a/source/test/TReduceSumSquared.cpp
+++ b/source/test/TReduceSumSquared.cpp
--- a/source/test/TReduceSumSquared.h
+++ b/source/test/TReduceSumSquared.h
--- a/source/test/TReduceVariance.cpp
+++ b/source/test/TReduceVariance.cpp
--- a/source/test/TReduceVariance.h
+++ b/source/test/TReduceVariance.h
--- a/source/test/TScaleAndShift.cpp
+++ b/source/test/TScaleAndShift.cpp
--- a/source/test/TScaleAndShift.h
+++ b/source/test/TScaleAndShift.h
--- a/source/test/TSelect.cpp
+++ b/source/test/TSelect.cpp
--- a/source/test/TSelect.h
+++ b/source/test/TSelect.h
--- a/source/test/TSetAscendingOrder.cpp
+++ b/source/test/TSetAscendingOrder.cpp
--- a/source/test/TSetAscendingOrder.h
+++ b/source/test/TSetAscendingOrder.h
--- a/source/test/TSetData.cpp
+++ b/source/test/TSetData.cpp
--- a/source/test/TSetData.h
+++ b/source/test/TSetData.h
--- a/source/test/TSigmoid.cpp
+++ b/source/test/TSigmoid.cpp
--- a/source/test/TSigmoid.h
+++ b/source/test/TSigmoid.h
--- a/source/test/TSoftmax.cpp
+++ b/source/test/TSoftmax.cpp
--- a/source/test/TSoftmax.h
+++ b/source/test/TSoftmax.h
--- a/source/test/TSort.cpp
+++ b/source/test/TSort.cpp
--- a/source/test/TSort.h
+++ b/source/test/TSort.h
--- a/source/test/TSplit.cpp
+++ b/source/test/TSplit.cpp
--- a/source/test/TSplit.h
+++ b/source/test/TSplit.h
--- a/source/test/TSum.cpp
+++ b/source/test/TSum.cpp
--- a/source/test/TSum.h
+++ b/source/test/TSum.h
--- a/source/test/TSumByColumnTV.cpp
+++ b/source/test/TSumByColumnTV.cpp
--- a/source/test/TSumByColumnTV.h
+++ b/source/test/TSumByColumnTV.h
--- a/source/test/TSumByColumnVT.cpp
+++ b/source/test/TSumByColumnVT.cpp
--- a/source/test/TSumByColumnVT.h
+++ b/source/test/TSumByColumnVT.h
--- a/source/test/TTopK.cpp
+++ b/source/test/TTopK.cpp
--- a/source/test/TTopK.h
+++ b/source/test/TTopK.h
--- a/source/test/TUnsqueeze.cpp
+++ b/source/test/TUnsqueeze.cpp
--- a/source/test/TUnsqueeze.h
+++ b/source/test/TUnsqueeze.h
--- a/source/test/TXMem.cpp
+++ b/source/test/TXMem.cpp
--- a/source/test/TXMem.h
+++ b/source/test/TXMem.h
--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp
--- a/source/test/Test.h
+++ b/source/test/Test.h