Merge branch 'xiaotong-working'

dd6646ed · xuchen · 4f37a5ad · 7ac8e731 · dd6646ed · 4f37a5ad
Commit dd6646ed authored Jul 20, 2018 by xuchen
--- a/doc/manual.md
+++ b/doc/manual.md
--- a/source/XLink.cpp
+++ b/source/XLink.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-04
- */
-
-#include <stdio.h>
-#include "XLink.h"
-
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-
-/* number of bytes reserved for each parameter slot in the "params" buffer
-   (slot i is addressed as (char*)params + i * paramSize) */
-int XLink::paramSize = 64;
-
-/* constructor: build an empty hyperedge (no head, no tails, no parameters, empty type name) */
-XLink::XLink()
-    : head(NULL), tails(NULL), params(NULL), tailNum(0), paramNum(0)
-{
-    type[0] = 0;
-}
-    
-/* destructor: release the tail array and the raw parameter buffer */
-XLink::~XLink()
-{
-    /* params is held as void*, but it is allocated as a char array */
-    char * paramBuf = (char*)params;
-    delete[] paramBuf;
-    delete[] tails;
-}
-
-/* reset it: free the owned buffers and go back to the empty state */
-void XLink::Reset()
-{
-    /* release what the edge owns */
-    delete[] (char*)params;
-    delete[] tails;
-
-    /* clear every field */
-    params   = NULL;
-    paramNum = 0;
-    tails    = NULL;
-    tailNum  = 0;
-    head     = NULL;
-    type[0]  = 0;
-}
-
-/*
-set edge type name
->> typeName - type name in string (NULL leaves the name empty)
-NOTE(review): the copy is unbounded; the capacity of "type" is declared
-elsewhere, so callers must pass a name that fits - confirm against XLink.h
-*/
-void XLink::SetType(const char * typeName)
-{
-    if(typeName != NULL)
-        strcpy(type, typeName);
-    else
-        type[0] = 0;
-}
-
-/*
-set head
->> h - pointer to the head tensor (stored as is, not copied)
-*/
-void XLink::SetHead(XTensor * h)
-{
-    this->head = h;
-}
-
-/*
-add a tail
-The tail array is re-allocated with one more slot and the old array is freed.
->> t - pointer to the tail tensor
-*/
-void XLink::AddTail(XTensor * t)
-{
-    XTensor ** old = tails;
-    tails = new XTensor*[tailNum + 1];
-
-    /* "old" is NULL for an empty edge; passing NULL to memcpy is undefined
-       even when the size is zero, so copy only when there is something to copy */
-    if(old != NULL && tailNum > 0)
-        memcpy(tails, old, sizeof(XTensor*) * tailNum);
-
-    tails[tailNum++] = t;
-    delete[] old;
-}
-
-/*
-add two tails in one time
-The tail array is re-allocated with two more slots and the old array is freed.
->> t1 - pointer to the tail tensor
->> t2 - pointer to another tail tensor
-*/
-void XLink::AddTwoTails(XTensor * t1, XTensor * t2)
-{
-    XTensor ** old = tails;
-    tails = new XTensor*[tailNum + 2];
-
-    /* "old" is NULL for an empty edge; passing NULL to memcpy is undefined
-       even when the size is zero, so copy only when there is something to copy */
-    if(old != NULL && tailNum > 0)
-        memcpy(tails, old, sizeof(XTensor*) * tailNum);
-
-    tails[tailNum++] = t1;
-    tails[tailNum++] = t2;
-    delete[] old;
-}
-
-/*
-add a parameter
-Each parameter occupies one slot of paramSize bytes in the "params" buffer.
->> param - parameter in default type
-*/
-void XLink::AddParam(DTYPE param)
-{
-    void * ps = params;
-
-    /* the buffer holds (paramNum + 1) slots of paramSize bytes each; allocating
-       only (paramNum + 1) bytes would overflow on the write below */
-    params = new char[(paramNum + 1) * paramSize];
-
-    /* keep the existing slots (nothing to copy for the first parameter) */
-    if(ps != NULL && paramNum > 0)
-        memcpy(params, ps, paramNum * paramSize);
-
-    /* the new value goes to the beginning of the last slot */
-    DTYPE * p = (DTYPE*)((char*)params + paramNum * paramSize);
-    *p = param;
-    paramNum++;
-    delete[] (char*)ps;
-}
-
-/*
-add a parameter
-Each parameter occupies one slot of paramSize bytes in the "params" buffer.
->> param - pointer to the parameter
->> size - size of the parameter (in bytes, must not exceed paramSize)
-*/
-void XLink::AddParam(void * param, int size)
-{
-    CheckNTErrors(param != NULL, "Empty parameter!");
-    CheckNTErrors(size >= 0 && size <= paramSize, "The parameter does not fit in a slot!");
-
-    void * ps = params;
-
-    /* the buffer holds (paramNum + 1) slots of paramSize bytes each; allocating
-       only (paramNum + 1) bytes would overflow on the write below */
-    params = new char[(paramNum + 1) * paramSize];
-
-    /* keep the existing slots (nothing to copy for the first parameter) */
-    if(ps != NULL && paramNum > 0)
-        memcpy(params, ps, paramNum * paramSize);
-
-    /* the new value goes to the beginning of the last slot */
-    char * p = (char*)params + paramNum * paramSize;
-    memcpy(p, param, size);
-    paramNum++;
-    delete[] (char*)ps;
-}
-/*
-create a hyperedge with two input tensors and a output tensor
-The income edge of h is rebuilt from scratch, and h is appended to the
-outgo edge of each non-NULL tail.
->> t1 - a tail tensor
->> t2 - another tail tensor (can be NULL for a single-input edge)
->> h - head tensor (nothing is done if it is NULL)
->> typeName - name of edge type
-*/
-void XLink::MakeLink(XTensor * t1, XTensor * t2, XTensor * h, const char * typeName)
-{
-    /* no head, no edge. (The test used to be inverted, which returned early
-       for every valid head and dereferenced a NULL one.) */
-    if(h == NULL)
-        return;
-
-    /* forward */
-    XLink &income = h->income;
-    income.Reset();
-    income.SetHead(h);
-    if(t1 != NULL && t2 != NULL)
-        income.AddTwoTails(t1, t2);
-    else if(t1 != NULL)
-        income.AddTail(t1);
-    else{
-        ShowNTErrors("TODO!");
-    }
-    income.SetType(typeName);
-
-    /* backward for t1 */
-    if(t1 != NULL){
-        XLink &outgo = t1->outgo;
-        CheckNTErrors(outgo.head != t1, "Wrong head of the hyperedge!");
-        outgo.AddTail(h);
-    }
-
-    /* backward for t2 */
-    if(t2 != NULL){
-        XLink &outgo = t2->outgo;
-        CheckNTErrors(outgo.head != t2, "Wrong head of the hyperedge!");
-        outgo.AddTail(h);
-    }
-}
-
-/*
-create a hyper edge with a list of tensors and a output tensor
-The income edge of h is rebuilt from scratch, and h is appended to the
-outgo edge of every tensor in the list.
->> list - a list of input tensors
->> h - head tensor (nothing is done if it is NULL)
->> typeName - name of edge type
-*/
-void XLink::MakeLink(XList * list, XTensor * h, const char * typeName)
-{
-    /* same contract as the two-tail version: no head, no edge */
-    if(h == NULL)
-        return;
-
-    CheckNTErrors(list != NULL, "Empty input list!");
-
-    /* forward */
-    XLink &income = h->income;
-    income.Reset();
-    income.SetHead(h);
-    income.SetType(typeName);
-
-    for(int i = 0; i < list->count; i++){
-        XTensor * t = (XTensor*)list->GetItem(i);
-        income.AddTail(t);
-    }
-
-    /* backward */
-    for(int i = 0; i < list->count; i++){
-        XTensor * t = (XTensor*)list->GetItem(i);
-        XLink &outgo = t->outgo;
-        CheckNTErrors(outgo.head != t, "Wrong head of the hyperedge!");
-        outgo.AddTail(h);
-    }
-}
-
-/*
-add parameters
-The parameter is appended to the income edge of the head.
->> h - head (nothing is done if it is NULL)
->> param - parameter we want introduce
-*/
-void XLink::AddParamToHead(XTensor * h, DTYPE param)
-{
-    /* the test used to be inverted: it skipped every valid head and
-       dereferenced a NULL one */
-    if(h == NULL)
-        return;
-    h->income.AddParam(param);
-}
-
-/*
-add an integer parameter
-The integer is copied (sizeof(int) bytes) into the income edge of the head.
->> h - head (nothing is done if it is NULL)
->> param - parameter we want introduce
-*/
-void XLink::AddParamToHeadInt(XTensor * h, int param)
-{
-    /* the test used to be inverted: it skipped every valid head and
-       dereferenced a NULL one */
-    if(h == NULL)
-        return;
-    h->income.AddParam(&param, sizeof(int));
-}
-    
-} // namespace nts(NiuTrans.Tensor)
-
--- a/source/core/CHeader.h
+++ b/source/core/CHeader.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-/* this is a header to include all functions in the "core" workspace */
-
-#ifndef __CHEADER_H__
-#define __CHEADER_H__
-
-#include "../XTensor.h"
-
-#include "Concatenate.h"
-#include "ConcatenateSolely.h"
-#include "CopyIndexed.h"
-#include "CopyInGrid.h"
-#include "CopyValues.h"
-#include "FlushToMem.h"
-#include "MakeMergeBlockIndex.h"
-#include "MakeSplitBlockIndex.h"
-#include "MatrixMul.h"
-#include "MatrixMul2D.h"
-#include "MatrixMul2DMultiTheading.h"
-#include "MatrixMul2DParallel.h"
-#include "MatrixMulBatched.h"
-#include "MatrixMULBatchedCPU.h"
-#include "Merge.h"
-#include "MergeBlockLists.h"
-#include "Multiply.h"
-#include "Negate.h"
-#include "Normalize.h"
-#include "Permute.h"
-#include "Power.h"
-#include "ReduceMax.h"
-#include "ReduceMean.h"
-#include "ReduceStandardVariance.h"
-#include "ReduceSum.h"
-#include "ReduceSumSquared.h"
-#include "ReduceVariance.h"
-#include "ScaleAndShift.h"
-#include "SetData.h"
-#include "Sort.h"
-#include "Split.h"
-#include "Sum.h"
-#include "SumByColumnTV.h"
-#include "SumByColumnVT.h"
-#include "TopK.h"
-#include "Unsqueeze.h"
-#include "XMatrixSegment.h"
-#include "XTensorBLAS.h"
-
-#endif // __CHEADER_H__
\ No newline at end of file
--- a/source/core/Concatenate.cpp
+++ b/source/core/Concatenate.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../XTensor.h"
-#include "Concatenate.h"
-#include "Merge.h"
-#include "ConcatenateSolely.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-concatenate a list of tensors along a given dimension
-This is a dispatcher: when XTensor::IsIdentical holds for every pair of
-neighbouring tensors in the list, "Merge" is called; otherwise
-"ConcatenateSolely" is called.
->> smalls - a list of tensors for concatenation
->> big - the resulting tensor
->> dim - which dimension we perform the concatenation
-*/
-void Concatenate(XList * smalls, XTensor * big, int dim)
-{
-    bool allSame = true;
-
-    /* walk over neighbouring pairs (k, k + 1); every pair is checked for
-       NULL items even after a mismatch has been found */
-    for (int k = 0; k + 1 < smalls->count; k++) {
-        XTensor * prev = (XTensor*)smalls->GetItem(k);
-        XTensor * next = (XTensor*)smalls->GetItem(k + 1);
-        CheckNTErrors((prev && next), "Empty input tensors!");
-        if (!XTensor::IsIdentical(prev, next))
-            allSame = false;
-    }
-
-    if (!allSame) {
-        ConcatenateSolely(smalls, big, dim);
-        return;
-    }
-
-    Merge(smalls, big, dim);
-}
-
-/*
-concatenate two tensors along a given dimension
-The two inputs are put into a temporary list and handed to the list version.
->> smallA - the first input tensor
->> smallB - the second input tensor
->> big - the resulting tensor
->> dim - which dimension we perform the concatenation
-*/
-void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
-{
-    XList pair(2);
-
-    pair.Add(smallA);
-    pair.Add(smallB);
-
-    Concatenate(&pair, big, dim);
-}
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/core/Select.cuh
+++ b/source/core/Select.cuh
--- a/source/core/SetData.cpp
+++ b/source/core/SetData.cpp
-/*
- * NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-05-08
- */
-
-#include "SetData.h"
-#include "CopyValues.h"
-
-#if !defined( WIN32 ) && !defined( _WIN32 )
-    #include "sys/time.h"
-    #include "time.h"
-    #include "iconv.h"
-#else
-    #include "time.h"
-    #include "windows.h"
-    #include "process.h"
-#endif
-
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-
-/*
-generate data items with a uniform distribution in [low,high]
-Only X_FLOAT and X_DOUBLE tensors are supported; other data types raise an error.
->> tensor - the tensor whose data array would be initialized (NULL is ignored)
->> low - lower value of the range
->> high - higher value of the range
-*/
-void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
-{
-    if(tensor == NULL)
-        return;
-    
-    /* CPU code (devID < 0) */
-    if(tensor->devID < 0){
-        /* width of the range (not a statistical variance) */
-        DTYPE variance = high - low;
-        
-        /* NOTE(review): the generator is re-seeded from the clock on every call,
-           so calls made within the same clock tick presumably produce the same
-           values - confirm whether this is intended */
-        srand((unsigned)time(NULL));
-        
-        if(tensor->dataType == X_FLOAT){
-            float * d = (float*)tensor->data;
-            for(int i = 0; i < tensor->unitNum; i++){
-                d[i] = variance * ((float)rand()/RAND_MAX) + low;
-            }
-        }
-        else if(tensor->dataType == X_DOUBLE){
-            double * d = (double*)tensor->data;
-            for(int i = 0; i < tensor->unitNum; i++){
-                d[i] = variance * ((double)rand()/RAND_MAX) + low;
-            }
-        }
-        else{
-            ShowNTErrors("TODO");
-        }
-    }
-    /* GPU code
-       The trick here is to initialize the data on a temporary tensor on CPU.
-       The CPU data is then copied to GPU.
-       TODO: generate data points on GPUs straightforwardly.
-    */
-    else{
-        /* same order, dimensions, data type and dense ratio, but created with devID = -1 */
-        XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
-        SetDataRand(t2, low, high);
-        CopyValues(t2, tensor);
-        delete t2;
-    }
-}
-    
-} // namespace nts(NiuTrans.Tensor)
-
--- a/source/core/SetData.cu
+++ b/source/core/SetData.cu
--- a/source/core/SetData.cuh
+++ b/source/core/SetData.cuh
--- a/source/core/XTensorCore.h
+++ b/source/core/XTensorCore.h
--- a/source/function/Loss.cu
+++ b/source/function/Loss.cu
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "Loss.h"
-#include "Loss.cuh"
-#include "../XDevice.h"
-
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/*
-loss function to measure the "number" of errors
-*/
-
-
-/*
-compute the loss
-This is a placeholder: no kernel is launched and the result is always 0.
->> gold - gold standard
->> y - model prediction
->> LFName - name of loss function
->> isLogOutput - is the output in log scale?
->> leadDim - the leading dimension for the output
->> gBeg - where to start in the gold standard (along the leading dimension)
->> gLen - segment length from gBeg (along the leading dimension)
->> yBeg - where to start in the model output (along the leading dimension)
-<< return - error in model prediction with respect to gold standard
-*/
-DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
-                      bool isLogOutput, int leadDim, int gBeg, int gLen, int yBeg)
-{
-    /* TODO: call cuda kernels for computing the errors */
-    const DTYPE notImplemented = 0;
-    return notImplemented;
-}
-
-/*
-the log version of loss computation
-This is a placeholder: no kernel is launched and the result is always 0.
->> gold - gold standard
->> y - model prediction
->> LFName - name of loss function
->> leadDim - the leading dimension for the output
->> gBeg - where to start in the gold standard (along the leading dimension)
->> gLen - segment length from gBeg (along the leading dimension)
->> yBeg - where to start in the model output (along the leading dimension)
-<< return - error in model prediction with respect to gold standard
-*/
-DTYPE CudaLossComputeForLogScale(XTensor * gold, XTensor * y,
-                                 LOSS_FUNCTION_NAME LFName,
-                                 int leadDim, int gBeg, int gLen, int yBeg)
-{
-    /* TODO: call cuda kernels for computing the errors */
-    const DTYPE notImplemented = 0;
-    return notImplemented;
-}
-
-/*
-backward computation for a single element (Cuda version)
-dE/dy
-where E is the error(loss) function that measures the errors in y
-with respect to gold standard, and y is the model output.
-For now this simply forwards to LossBackwardPoint.
->> t - gold standard
->> y - model output
->> LFName - name of loss function
-<< return dE/dy
-*/
-DTYPE CudaLossBackward(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
-{
-    /* TODO: call cuda kernels for computing the errors */
-    return LossBackwardPoint(t, y, LFName);
-}
-
-/*
-backward computation for squared error (Cuda kernel)
-Each thread handles one element: dedy[i] = y[i] - t[i]
->> dedy - dE/dy (for return)
->> t - gold standard (in vector)
->> y - model output (in vector)
->> size - size of the vector (dedy)
-*/
-extern "C" __global__
-void KernelLossBackwardSquaredError(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
-{
-    int idx = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* threads beyond the end of the vector have nothing to do */
-    if (idx >= size)
-        return;
-
-    dedy[idx] = y[idx] - t[idx];
-}
-
-/*
-backward computation of blocks for squared error (Cuda kernel)
-Only the elements whose in-block offset (i % blockSize) falls into
-[begInBlock, begInBlock + lenInBlock) are written: dedy[i] = y[i] - t[i]
->> dedy - dE/dy (for return)
->> t - gold standard (in vector)
->> y - model output (in vector)
->> blockSize - size of a block
->> begInBlock - the beginning position in a block for computation
->> lenInBlock - number of items in a block for computation
->> size - size of the vector (dedy)
-*/
-extern "C" __global__
-void KernelLossBackwardSquaredErrorBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
-                                         int blockSize, int begInBlock, int lenInBlock, int size)
-{
-    int idx = blockDim.x * blockIdx.x + threadIdx.x;
-    int pos = idx % blockSize;
-
-    bool inSegment = (pos >= begInBlock) && (pos < begInBlock + lenInBlock);
-
-    if (inSegment && idx < size)
-        dedy[idx] = y[idx] - t[idx];
-}
-
-/*
-backward computation for cross entropy (Cuda kernel)
-Each thread handles one element: dedy[i] = -t[i] / y[i]
->> dedy - dE/dy (for return)
->> t - gold standard (in vector)
->> y - model output (in vector)
->> size - size of the vector (dedy)
-*/
-extern "C" __global__
-void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
-{
-    int idx = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* threads beyond the end of the vector have nothing to do */
-    if (idx >= size)
-        return;
-
-    dedy[idx] = -t[idx]/y[idx];
-}
-
-/*
-backward computation of blocks for cross entropy (Cuda kernel)
-Only the elements whose in-block offset (i % blockSize) falls into
-[begInBlock, begInBlock + lenInBlock) are written: dedy[i] = -t[i] / y[i]
->> dedy - dE/dy (for return)
->> t - gold standard (in vector)
->> y - model output (in vector)
->> blockSize - size of a block
->> begInBlock - the beginning position in a block for computation
->> lenInBlock - number of items in a block for computation
->> size - size of the vector (dedy)
-*/
-extern "C" __global__
-void KernelLossBackwardCrossEntropyBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
-                                         int blockSize, int begInBlock, int lenInBlock, int size)
-{
-    int idx = blockDim.x * blockIdx.x + threadIdx.x;
-    int pos = idx % blockSize;
-
-    bool inSegment = (pos >= begInBlock) && (pos < begInBlock + lenInBlock);
-
-    if (inSegment && idx < size)
-        dedy[idx] = -t[idx]/y[idx];
-}
-
-/*
-backward computation for (dense) vectors (Cuda version)
-dE/dy
-where E is the error(loss) function that measures the errors in y
-with respect to gold standard, and y is the model output.
-All three tensors must have the same shape, be of the default data type
-and live on the same GPU. Sparse gold standards are not supported yet.
->> dedy - dE/dy (for return)
->> t - gold standard (in vector)
->> y - model output (in vector)
->> LFName - name of loss function (SQUAREDERROR or CROSSENTROPY; other names do nothing)
->> leadDim - the leading dimension for the output
->> tBeg - where to start in the gold standard (along the leading dimension)
->> tLen - segment length from tBeg (along the leading dimension)
->> yBeg - where to start in the model output (along the leading dimension)
-*/
-void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y, 
-                      LOSS_FUNCTION_NAME LFName, 
-                      int leadDim, int tBeg, int tLen, int yBeg)
-{
-    CheckNTErrors((XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y)), 
-                        "The input tensors must be of the same size!");
-    /* NOTE(review): dedy is tested on index 1 while t and y are tested on index 0 - confirm */
-    CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[1] == 1), "TODO!");
-    CheckNTErrors((t->dataType == DEFAULT_DTYPE && 
-                         y->dataType == DEFAULT_DTYPE && 
-                         dedy->dataType == DEFAULT_DTYPE),
-                         "Input vectors are not in default type.");
-
-    CheckNTErrors((dedy->devID >= 0 && t->devID >= 0 && y->devID >= 0),
-                         "The backward compuation must be performed on GPUs.");
-
-    CheckNTErrors((dedy->devID == t->devID && dedy->devID == y->devID),
-                        "The vectors must be on the same GPU.");
-    CheckNTErrors((tBeg == yBeg), "TODO!");
-
-    /* index of the leading dimension counted from the last dimension */
-    int leadDimRDI = y->order - leadDim - 1;
-    if(leadDimRDI < 0){
-        /* NOTE(review): a dimension size is used as a dimension index here;
-           "y->order - 1" looks like what was meant - confirm before changing */
-        leadDimRDI = y->dimSizeRDI[y->order - 1];
-        tBeg = 0;
-        yBeg = 0;
-        tLen = y->dimSizeRDI[leadDimRDI];
-    }
-
-    int stride = 1;
-    /* NOTE(review): blockSize is never updated, so the "Block" kernels below
-       always see an in-block offset of 0 - confirm */
-    int blockSize = 1;
-    int size = 1;
-
-    /* number of elements spanned by one step along the leading dimension */
-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
-    size = tLen * stride;
-
-    int cudaGridSize[3], cudaBlockSize[3];
-
-    GDevs.GetCudaThread(dedy->devID, y->unitNum, cudaGridSize, cudaBlockSize);
-
-    dim3 blocks(cudaGridSize[0]);
-    dim3 threads(cudaBlockSize[0]);
-
-    DTYPE * tp = (DTYPE*)t->data;
-    DTYPE * yp = (DTYPE*)y->data;
-    DTYPE * dedyp = (DTYPE*)dedy->data;
-
-    int devIDBackup;
-    ProtectCudaDev(y->devID, devIDBackup);
-
-    /* 
-    squared error 
-    loss = sum_{i} 0.5*(t_i - y_i)^2, where t_i is the gold standard and y_i is the model output
-    dloss/dy_i = y_i - t_i
-    */
-    if(LFName == SQUAREDERROR){
-        if(t->isSparse){
-            ShowNTErrors("TODO!");
-        }
-        else if(size == y->unitNum){
-            KernelLossBackwardSquaredError<<<blocks, threads>>>(dedyp, tp, yp, y->unitNum);
-        }
-        else{
-            KernelLossBackwardSquaredErrorBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
-        }
-    }
-
-    /* 
-    cross entropy
-    loss = sum_{i} (-t_i * log(y_i)), where t and y are distributions 
-    dloss/dy_i = -t_i / y_i
-    */
-    if(LFName == CROSSENTROPY){
-        if(t->isSparse){
-            ShowNTErrors("TODO!");
-        }
-        else if(size == y->unitNum){
-            /* the whole vector is processed in this branch, so the kernel needs the
-               full element count (as in the squared-error case); passing tLen would
-               leave all but the first tLen items untouched whenever stride > 1 */
-            KernelLossBackwardCrossEntropy<<<blocks, threads>>>(dedyp, tp, yp, y->unitNum);
-        }
-        else{
-            KernelLossBackwardCrossEntropyBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
-        }
-    }
-
-    BacktoCudaDev(y->devID, devIDBackup);
-}
-
-#endif
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
+ */
+
+#include <stdio.h>
+#include "XNet.h"
+#include "../tensor/function/FHeader.h"
+#include "../tensor/core/CHeader.h"
+#include "../sample/fnnlm/FNNLM.h"
+
+//#define CRTDBG_MAP_ALLOC
+//#include <stdlib.h>
+//#include <crtdbg.h>
+
+using namespace nts;
+using namespace samplefnnlm;
+
+
+/*
+entry of the program
+"-test" is accepted but currently does nothing (the call of Test() is
+commented out); "-fnnlm" runs the FNNLM sample with the remaining arguments;
+anything else prints a short usage message.
+Whatever the option is, a small demo network is then built, dumped and
+back-propagated.
+>> argc - number of command-line arguments
+>> argv - the command-line arguments
+<< return - always 0
+*/
+int main( int argc, const char ** argv )
+{
+    const bool hasOption = argc > 1;
+
+    if(hasOption && strcmp(argv[1], "-test") == 0){
+        /* unit test is switched off for now: Test(); */
+    }
+    else if(hasOption && strcmp(argv[1], "-fnnlm") == 0){
+        FNNLMMain(argc - 1, argv + 1);
+    }
+    else{
+        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
+        fprintf(stderr, "neural networks in an easy way. \n\n");
+        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
+        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
+    }
+
+    /* a toy network: b = Merge(a, a), c = HTanH(a * b) */
+    XNet net;
+    XTensor a;
+    XTensor b;
+    XTensor c;
+
+    InitTensor2D(&a, 2, 2);
+    InitTensor2D(&b, 2, 4);
+    InitTensor2D(&c, 2, 4);
+
+    a.SetZeroAll();
+    b.SetZeroAll();
+    c.SetZeroAll();
+
+    /* fill a with 0.1 and overwrite its second row */
+    SetDataFixed(a, 0.1F);
+    a.Set2D(0.3F, 1, 0);
+    a.Set2D(0.4F, 1, 1);
+
+    b = Merge(a, a, 1);
+    c = HTanH(MMul(a, b));
+
+    /* show the tensors and the links recorded for c */
+    a.Dump(stderr, "a:");
+    b.Dump(stderr, "b:");
+    c.Dump(stderr, "c:");
+    XLink::ShowNetwork(stderr, &c);
+
+    /* back-propagation starting from c */
+    net.Backward(c);
+
+    net.Dump(stderr);
+
+    //_CrtDumpMemoryLeaks();
+
+    return 0;
+}
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for activation function
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ * Dingdang won 5 games in the GO training yesterday, hahaha ...
+ */
+
+#include "XNoder.h"
+#include "XBackwardFunc.h"
+#include "../tensor/XName.h"
+#include "../tensor/function/FHeader.h"
+
+namespace nts{
+
+/*
+compute dE/dx of a node
+The node is the output of an activation function whose single input is the
+only tail of the income edge. The gradient of the input is prepared by
+XNoder::MakeGrad and then the backward routine that matches the function
+type is called with no gold standard and NOLOSS.
+>> node - the output node of the activation function (its gradient must exist)
+*/
+void XFuncGrad::MakeGrad(XTensor * node)
+{
+    XLink &edge = node->income;
+    const int funcID = edge.typeID;
+
+    CheckNTErrors(node->grad != NULL, "No gradient found!");
+    CheckNTErrors(edge.tailNum == 1, "Too many input tensors for the function!");
+
+    XTensor * y = node;
+    XTensor * x = edge.tails[0];
+
+    XNoder::MakeGrad(x);
+
+    if(funcID == FUNC_HARDTANH){
+        _HardTanHBackward(NULL, y, x, y->grad, x->grad, NOLOSS);
+    }
+    else if(funcID == FUNC_IDENTITY){
+        _IdentityBackward(NULL, y, x, y->grad, x->grad, NOLOSS);
+    }
+    else if(funcID == FUNC_LOGSOFTMAX){
+        /* the leading dimension is kept as the first parameter of the edge */
+        int leadDim = edge.GetParamInt(0);
+        _LogSoftmaxBackward(NULL, y, x, y->grad, x->grad, leadDim, NOLOSS);
+    }
+    else if(funcID == FUNC_RECTIFY){
+        _RectifyBackward(NULL, y, x, y->grad, x->grad, NOLOSS);
+    }
+    else if(funcID == FUNC_SIGMOID){
+        _SigmoidBackward(NULL, y, x, y->grad, x->grad, NOLOSS);
+    }
+    else if(funcID == FUNC_SOFTMAX){
+        /* the leading dimension is kept as the first parameter of the edge */
+        int leadDim = edge.GetParamInt(0);
+        _SoftmaxBackward(NULL, y, x, y->grad, x->grad, leadDim, NOLOSS);
+    }
+    else{
+        ShowNTErrors("Wrong activation function type!");
+    }
+}
+
+/*
+indicates whether the node is for an activation function
+>> node - the node to test
+<< return - true if the type id of the income edge has a bit of FUNCTION_BASE set
+*/
+bool XFuncGrad::IsFunc(XTensor * node)
+{
+    const int id = node->income.typeID;
+    return (id & FUNCTION_BASE) != 0;
+}
+
+}
--- a/source/network/XBackwardFunc.h
+++ b/source/network/XBackwardFunc.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for activation function
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ * Dingdang won 5 games in the GO training yesterday, hahaha ...
+ */
+
+#include "../tensor/XTensor.h"
+#include "../tensor/function/FHeader.h"
+
+#ifndef __XBACKWARDFUNC_H__
+#define __XBACKWARDFUNC_H__
+
+namespace nts{
+
+/* gradient computation for activation functions; all members are static,
+   so no instance is needed */
+class XFuncGrad
+{
+public:
+    /* compute dE/dx of a node (the node is the output of the function) */
+    static void MakeGrad(XTensor * node);
+
+    /* indicates whether the node is for an activation function */
+    static bool IsFunc(XTensor * node);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
+ */
+
+#include "XBackwardLoss.h"
+#include "../tensor/XName.h"
+#include "../tensor/function/HardTanH.h"
+#include "../tensor/function/LogSoftmax.h"
+
+namespace nts{
+
+/* 
+compute dE/dx for a given function y = f(x) 
+Only FUNC_HARDTANH and FUNC_LOGSOFTMAX are handled; any other function id
+raises an error.
+>> gold - gold standard to measure error (or loss)
+>> y - output of the function
+>> x - input of the function
+>> dedy - dE/dy
+>> dedx - dE/dx
+>> funcID - id of the function f
+>> params - parameters of the function (for log-softmax: a pointer to the
+            leading dimension stored as an int)
+>> lossName - name of the loss, e.g., cross entropy
+*/
+void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
+                        XTensor * dedy, XTensor * dedx,
+                        int funcID, void * params,
+                        LOSS_FUNCTION_NAME lossName)
+{
+    CheckNTErrors(gold && y && x, "Empty input tensors!");
+    CheckNTErrors(dedx, "Empty gradient tensors!");
+    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
+
+    if(funcID == FUNC_HARDTANH){
+        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
+    }
+    else if(funcID == FUNC_LOGSOFTMAX){
+        /* the leading dimension is read through params, so it must be there */
+        CheckNTErrors(params != NULL, "No parameter found for the function!");
+        int leadDim = *(int*)params;
+        _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+    }
+    else{
+        ShowNTErrors("wrong function found when call the backward process!");
+    }
+
+}
+
+/*
+compute dE/dy for variable y and error(loss) function E
+This is a thin wrapper of _LossBackward.
+>> gold - gold standard to measure error (or loss)
+>> y - output of the function
+>> dedy - dE/dy (for return)
+>> lossName - name of the loss, e.g., cross entropy
+*/
+void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * dedy, LOSS_FUNCTION_NAME lossName)
+{
+    /* note the argument order of the callee: gradient first, then gold and output */
+    _LossBackward(dedy, gold, y, lossName);
+}
+
+}
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
+ * My students worked all night to prepare a submission to CWMT. Good luck
+ * to them!
+ */
+
+#include "../tensor/XTensor.h"
+#include "../tensor/function/FHeader.h"
+
+#ifndef __XBACKWARDLOSS_H__
+#define __XBACKWARDLOSS_H__
+
+namespace nts{
+
+/* gradient of an output node with respect to the loss.
+   The two entry points are overloads of Compute */
+class XLossGrad
+{
+public:
+    /* dE/dx for an output generated by a function y = f(x);
+       funcID tells which function f is and params points to its arguments */
+    void Compute(XTensor * gold, XTensor * y, XTensor * x,
+                 XTensor * dedy, XTensor * dedx,
+                 int funcID, void * params, LOSS_FUNCTION_NAME lossName);
+
+    /* dE/dy for an output y and a loss (error) function E */
+    void Compute(XTensor * gold, XTensor * y, XTensor * dedy, LOSS_FUNCTION_NAME lossName);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for math operations
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ */
+
+#include "XNoder.h"
+#include "XBackwardMath.h"
+#include "../tensor/XName.h"
+#include "../tensor/core/CHeader.h"
+
+namespace nts{
+
+/* 
+backward computation for a node generated by a math operation.
+The routine to run is chosen by the operation id kept in node->income
+>> node - the node whose gradient is propagated to its inputs
+*/
+void XMathGrad::MakeGrad(XTensor * node)
+{
+    CheckNTErrors(node->grad != NULL, "No gradient found!");
+
+    int opID = node->income.typeID;
+
+    if(opID == MATH_SUM){
+        GradSum(node);
+        return;
+    }
+
+    if(opID == MATH_MULTIPLY){
+        GradMultiply(node);
+        return;
+    }
+
+    if(opID == MATH_MATRIXMUL){
+        GradMatrixMul(node);
+        return;
+    }
+
+    /* no backward routine is available for this operation */
+    ShowNTErrors("TODO!");
+}
+
+/* 
+tell whether a node is generated by a math operation, i.e., 
+whether the operation id of its incoming edge shares bits with MATH_BASE
+>> node - the node to test
+*/
+bool XMathGrad::IsMathOP(XTensor * node)
+{
+    return (node->income.typeID & MATH_BASE) != 0;
+}
+
+/* 
+backward computation for sum
+for 
+c = a + b * \beta
+we have
+dE/da = dE/dc 
+dE/db = dE/dc * \beta
+Both results are summed with what is already in a->grad and b->grad
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradSum(XTensor * node)
+{
+    XLink &link = node->income;
+    CheckNTErrors(link.tailNum == 2, "Wrong input tensor number for SUM!");
+
+    XTensor * lhs = link.tails[0];
+    XTensor * rhs = link.tails[1];
+    DTYPE scale = link.GetParam(0);
+
+    /* both inputs need a gradient tensor before we can add to it */
+    XNoder::MakeGrad(lhs);
+    XNoder::MakeGrad(rhs);
+
+    XTensor * dedc = node->grad;
+
+    _Sum(lhs->grad, dedc, lhs->grad);
+    _Sum(rhs->grad, dedc, rhs->grad, scale);
+}
+
+/* 
+backward computation for multiply (dot production)
+for
+c = a * b 
+we have
+dE/da = dE/dc * b
+dE/db = dE/dc * a 
+The two inputs must be of the same shape
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradMultiply(XTensor * node)
+{
+    XLink &link = node->income;
+    CheckNTErrors(link.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
+
+    XTensor * lhs = link.tails[0];
+    XTensor * rhs = link.tails[1];
+
+    XNoder::MakeGrad(lhs);
+    XNoder::MakeGrad(rhs);
+
+    CheckNTErrors(XTensor::IsSameShaped(lhs, rhs), "Wrong sized input tensors!");
+
+    XTensor * dedc = node->grad;
+
+    /* NOTE(review): the trailing 1.0F is presumably the weight of the current 
+       content of the target, so that gradients accumulate - confirm against _Multiply */
+    _Multiply(dedc, rhs, lhs->grad, 1.0F);
+    _Multiply(dedc, lhs, rhs->grad, 1.0F);
+}
+
+/* 
+gradient for matrix multiply
+for c = op(a) * op(b) * \alpha, where op(x) is either x or x^T, 
+the gradient of an input must have the shape of the input as it is stored. 
+So if an input enters the product in transposed form, the product that 
+gives its gradient has to be transposed as well:
+c = a   * b   * \alpha  =>  dE/da = dE/dc * b^T * \alpha      dE/db = a^T * dE/dc * \alpha
+c = a^T * b   * \alpha  =>  dE/da = b * dE/dc^T * \alpha      dE/db = a * dE/dc * \alpha
+c = a   * b^T * \alpha  =>  dE/da = dE/dc * b * \alpha        dE/db = dE/dc^T * a * \alpha
+c = a^T * b^T * \alpha  =>  dE/da = b^T * dE/dc^T * \alpha    dE/db = dE/dc^T * a^T * \alpha
+The parameters of the incoming edge are (transA, transB, \alpha).
+NOTE(review): the trailing 1.0F passed to _MatrixMul is presumably the weight 
+of the current content of the target (gradient accumulation) - confirm against _MatrixMul
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradMatrixMul(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MATRIXMUL!");
+    CheckNTErrors(income.paramNum == 3, "Wrong parameter number for MATRIXMUL!");
+
+    XTensor * a = income.tails[0]; 
+    XTensor * b = income.tails[1];
+    MATRIX_TRANS_TYPE transA = income.GetParamTrans(0);
+    MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
+    DTYPE alpha = income.GetParam(2);
+
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    XTensor * dedc = node->grad;
+    XTensor * deda = a->grad;
+    XTensor * dedb = b->grad;
+    
+    /* c = a * b * \alpha */
+    if(transA == X_NOTRANS && transB == X_NOTRANS){
+
+        /* dE/da = dE/dc * b^T * \alpha */
+        _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
+
+        /* dE/db = a^T * dE/dc * \alpha */
+        _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+    }
+    
+    /* c = a^T * b * \alpha */
+    else if(transA == X_TRANS && transB == X_NOTRANS){
+        
+        /* dE/da = (dE/dc * b^T)^T * \alpha = b * dE/dc^T * \alpha */
+        _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = a * dE/dc * \alpha */
+        _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+    }
+    
+    /* c = a * b^T * \alpha */
+    else if(transA == X_NOTRANS && transB == X_TRANS){
+        
+        /* dE/da = dE/dc * b * \alpha */
+        _MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = (a^T * dE/dc)^T * \alpha = dE/dc^T * a * \alpha */
+        _MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
+    }
+    
+    /* c = a^T * b^T * \alpha */
+    else if(transA == X_TRANS && transB == X_TRANS){
+        
+        /* dE/da = (dE/dc * b)^T * \alpha = b^T * dE/dc^T * \alpha */
+        _MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = (a * dE/dc)^T * \alpha = dE/dc^T * a^T * \alpha */
+        _MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
+    }
+}
+
+}
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for math operations
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ */
+
+#include "../tensor/XTensor.h"
+
+#ifndef __XBACKWARDMATH_H__
+#define __XBACKWARDMATH_H__
+
+namespace nts{
+
+/* backward computation for nodes generated by math operations.
+   All members are static, so no instance of the class is needed */
+class XMathGrad
+{
+public:
+    /* propagate the gradient of a node to its inputs */
+    static void MakeGrad(XTensor * node);
+
+    /* whether the node is generated by a math operation */
+    static bool IsMathOP(XTensor * node);
+
+private:
+    /* backward of sum: c = a + b * \beta */
+    static void GradSum(XTensor * node);
+
+    /* backward of multiply (dot production): c = a * b */
+    static void GradMultiply(XTensor * node);
+
+    /* backward of matrix multiply: c = matmul(a, b) */
+    static void GradMatrixMul(XTensor * node);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for shaping and data movement
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-19
+ * It was chilly when I came into the office this morning ...
+ * because i forgot to turn the air-condition off last night :(
+ */
+
+#include "XNoder.h"
+#include "XBackwardShape.h"
+#include "../tensor/XName.h"
+#include "../tensor/core/CHeader.h"
+
+namespace nts{
+
+/* 
+backward computation for a node generated by a shaping operation.
+The routine to run is chosen by the operation id kept in node->income
+>> node - the node whose gradient is propagated to its inputs
+*/
+void XShapeGrad::MakeGrad(XTensor * node)
+{
+    CheckNTErrors(node->grad != NULL, "No gradient found!");
+
+    int opID = node->income.typeID;
+
+    if(opID == SHAPE_MERGE){
+        GradMerge(node);
+        return;
+    }
+
+    if(opID == SHAPE_MERGE_LIST){
+        GradMergeList(node);
+        return;
+    }
+
+    if(opID == SHAPE_UNSQUEEZE){
+        GradUnsqueeze(node);
+        return;
+    }
+
+    /* no backward routine is available for this operation */
+    ShowNTErrors("TODO!");
+}
+
+/* 
+tell whether a node is generated by a shaping operation, i.e., 
+whether the operation id of its incoming edge shares bits with DATA_BASE.
+NOTE(review): the test uses DATA_BASE - confirm that the SHAPE_* ids carry this bit
+>> node - the node to test
+*/
+bool XShapeGrad::IsShapeOP(XTensor * node)
+{
+    return (node->income.typeID & DATA_BASE) != 0;
+}
+
+/* 
+gradient for merge
+for 
+c = merge(a_0, a_1, ...)
+where a_i is the i-th block in a tensor a
+we have
+dE/da_0 = dE/dc_{split_0}
+dE/da_1 = dE/dc_{split_1}
+...
+i.e.,
+dE/da = split(dE/dc)
+The parameters of the incoming edge are (whereToMerge, leadDim). The work is 
+done block by block, where a block covers the dimensions from leadDim on.
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradMerge(XTensor * node)
+{
+    XLink &income = node->income;
+
+    /* the number of inputs is checked before the input is touched */
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
+
+    XTensor * input = income.tails[0];
+
+    CheckNTErrors(node->order == input->order - 1, "wrong tensor orders!");
+
+    int whereToMerge = income.GetParamInt(0);
+    int leadDim = income.GetParamInt(1);
+
+    /* number of blocks = product of the dimensions before leadDim */
+    int blockSize = 1;
+    int blockNum = 1;
+    for(int i = 0; i < input->order; i++){
+        if(i < leadDim)
+            blockNum *= input->dimSize[i];
+    }
+
+    /* size of a block in bytes */
+    blockSize = input->GetDataSizeInChar() / blockNum;
+
+    XNoder::MakeGrad(input);
+
+    /* dimensions of a block of the input (from leadDim on) */
+    int * dims = new int[input->order];
+    for(int i = 0, j = 0; i < input->order; i++){
+        if(i >= leadDim){
+            dims[j++] = input->dimSize[i];
+        }
+    }
+
+    /* NOTE(review): a negative leading dimension presumably tells the constructor 
+       not to allocate data - the data pointers are set by hand below and reset 
+       to NULL before the tensors go out of scope. Confirm against XTensor */
+    dims[0] = -dims[0];
+    XTensor gradInputSmall(input->order - leadDim, dims,
+                           input->dataType, input->denseRatio, 
+                           input->devID, input->mem);
+
+    /* NOTE(review): only the first (order - leadDim) entries of dims are set, 
+       so the offset dims + leadDim + 1 looks right for leadDim == 0 only - confirm */
+    dims[whereToMerge - leadDim] *= dims[0];
+    XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
+                          node->dataType, node->denseRatio, 
+                          node->devID, node->mem);
+
+    /* we can simply split the gradient tensor 
+       if the input is used in merging only */
+    if(input->outgo.tailNum == 1){
+        for(int i = 0; i < blockNum; i++){
+            gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
+            gradInputSmall.data = (char*)input->grad->data + i * blockSize;
+            _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
+        }
+    }
+
+    /* a more complicated case is that the input tensor is used for 
+       other operations somewhere else. So we have to do gradient 
+       accumulation after splitting, i.e., we need an additional 
+       SUM operation */
+    else{
+        XTensor gradInputSmallBuf(&gradInputSmall);
+
+        for(int i = 0; i < blockNum; i++){
+            gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
+            gradInputSmall.data = (char*)input->grad->data + i * blockSize;
+            _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
+            _Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
+        }
+    }
+
+    /* the two views do not own the memory they point to */
+    gradNodeSmall.data = NULL;
+    gradInputSmall.data = NULL;
+
+    delete[] dims;
+}
+
+/* 
+gradient for merging a list of tensors
+for 
+c = merge(list(a, b, ...)) 
+where a, b ... are of the same size
+we have
+dE/da = dE/dc_{split_0}
+dE/db = dE/dc_{split_1}
+i.e.,
+list(dE/da, dE/db, ...) = split(dE/dc)
+The first parameter of the incoming edge is the dimension to merge along
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradMergeList(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
+
+    XTensor * last = NULL;
+    XList smalls(income.tailNum);
+    XList smallsGrad(income.tailNum);
+    bool mergeOnly = true;
+    for(int i = 0; i < income.tailNum; i++){
+        XTensor * tail = income.tails[i];
+        XNoder::MakeGrad(tail);
+        smalls.Add(tail);
+        smallsGrad.Add(tail->grad);
+        
+        /* every input is compared with its predecessor, 
+           starting with the pair (0, 1) */
+        if(i > 0){
+            CheckNTErrors(XTensor::IsSameShaped(last, tail), 
+                         "Input tensors must be of the same size!");
+        }
+
+        /* the input is also consumed by some other operation */
+        if(tail->outgo.tailNum  > 1)
+            mergeOnly = false;
+
+        last = tail;
+    }
+
+    int whereToMerge = income.GetParamInt(0);
+
+    /* we can simply split the gradient tensor into the input tensors 
+       if the inputs are used in merging only */
+    if(mergeOnly)
+        _Split(node->grad, &smallsGrad, whereToMerge, smalls.count);
+
+    /* a more complicated case is that the input tensors are used for 
+       other operations somewhere else. So we have to do gradient 
+       accumulation after splitting, i.e., we need an additional 
+       SUM operation */
+    else{
+        /* a buffer that holds all the splits: (number of inputs, shape of an input) */
+        int * dims = new int[last->order + 1];
+        dims[0] = smalls.count;
+        for(int i = 0; i < last->order; i++)
+            dims[i + 1] = last->dimSize[i];
+
+        XTensor gradSplit(last->order + 1, dims, 
+                          last->dataType, last->denseRatio, 
+                          last->devID, last->mem);
+
+        _Split(node->grad, &gradSplit, whereToMerge, smalls.count);
+
+        /* a view on one split. NOTE(review): the negative leading dimension 
+           presumably tells the constructor not to allocate data - confirm */
+        memcpy(dims, last->dimSize, sizeof(int) * last->order);
+        dims[0] = -dims[0];
+        XTensor gradSmall(last->order, dims,
+                          last->dataType, last->denseRatio, 
+                          last->devID, last->mem);
+
+        /* gradient accumulation for each split */
+        for(int i = 0; i < smalls.count; i++){
+            XTensor * inputGrad = (XTensor*)smallsGrad.Get(i);
+            gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize;
+            _Sum(inputGrad, &gradSmall, inputGrad);
+        }
+
+        /* the view does not own the memory it points to */
+        gradSmall.data = NULL;
+        delete[] dims;
+    }
+}
+
+/* 
+gradient for unsqueezing a tensor
+for
+c = unsqueeze(a) 
+we have
+dE/da = reducesum(dE/dc)
+The parameters of the incoming edge are (dim, dSize): the dimension 
+that was inserted and its size
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradUnsqueeze(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for UNSQUEEZE!");
+
+    XTensor * output = node;
+    XTensor * input = income.tails[0];
+    XNoder::MakeGrad(input);
+
+    int dim = income.GetParamInt(0);
+    int dSize = income.GetParamInt(1);
+
+    CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
+
+    /* this is a comparison: the output must not be modified by the check */
+    CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
+
+    _ReduceSum(output->grad, input->grad, dim);
+}
+
+}
\ No newline at end of file
--- a/source/network/XBackwardShape.h
+++ b/source/network/XBackwardShape.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for shaping and data movement
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ */
+
+#include "../tensor/XTensor.h"
+#include "../tensor/function/FHeader.h"
+
+#ifndef __XBACKWARDSHAPE_H__
+#define __XBACKWARDSHAPE_H__
+
+namespace nts{
+
+/* backward computation for nodes generated by tensor shaping and data movement.
+   All members are static, so no instance of the class is needed */
+class XShapeGrad
+{
+public:
+    /* propagate the gradient of a node to its inputs */
+    static void MakeGrad(XTensor * node);
+
+    /* whether the node is generated by a shaping operation */
+    static bool IsShapeOP(XTensor * node);
+
+private:
+    /* backward of merge: c = merge(a, b, ...) */
+    static void GradMerge(XTensor * node);
+
+    /* backward of merging a list of tensors: c = merge(list(a, b, ...)) */
+    static void GradMergeList(XTensor * node);
+
+    /* backward of unsqueezing a tensor: c = unsqueeze(a) */
+    static void GradUnsqueeze(XTensor * node);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-12
+ */
+
+#include "XNet.h"
+#include "XNoder.h"
+#include "XBackwardLoss.h"
+#include "XBackwardMath.h"
+#include "XBackwardFunc.h"
+#include "XBackwardShape.h"
+#include "../tensor/XName.h"
+
+namespace nts{
+
+/* the network id handed out last (0 means that no id has been made so far) */
+unsigned int netIDGlobal = 0;
+
+/* lock that guards the update of netIDGlobal */
+MUTEX_HANDLE netMutex;
+
+/* 
+generate a network id. 
+Ids advance in steps of 3 because a traversal with id "code" marks the nodes 
+with code + 1 and code + 2 (see XNet::TarjanVisit).
+NOTE(review): the mutex is set up on the first call without any protection, 
+so concurrent first calls look unsafe - confirm how this is meant to be used
+*/
+unsigned int MakeNetID()
+{
+    /* first call: the mutex has not been initialized yet */
+    if(netIDGlobal == 0)
+        MUTEX_INIT(netMutex);
+
+    unsigned int newID;
+
+    MUTEX_LOCK(netMutex);
+    netIDGlobal += 3;
+    newID = netIDGlobal;
+    MUTEX_UNLOCK(netMutex);
+
+    return newID;
+}
+
+/* constructor: the node list is emptied. Note that "id" is not set here; 
+   it gets its value when Traverse is called */
+XNet::XNet()
+{
+    nodes.Clear();
+}
+
+/* destructor (nothing is released explicitly here) */
+XNet::~XNet()
+{
+}
+
+/* empty all the node lists kept by the network */
+void XNet::Clear()
+{
+    inputs.Clear();
+    outputs.Clear();
+    gradNodes.Clear();
+    nodes.Clear();
+}
+
+/* 
+backward propagation for a network with a single output, 
+given the gold standard of the output
+>> root - root node (output) of the network
+>> gold - gold standard for the output
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
+{
+    /* wrap the two tensors into one-element lists and 
+       call the list-based version */
+    XList rootList(1);
+    XList goldList(1);
+
+    rootList.Add(&root);
+    goldList.Add(&gold);
+
+    Backward(rootList, goldList, loss);
+}
+
+/* 
+backward propagation for a network with a single output 
+when no gold standard is available
+>> root - root node (output) of the network
+>> loss - name of loss function
+*/
+void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
+{
+    /* the list of gold standards holds a NULL entry for the root */
+    XList rootList(1);
+    XList goldList(1);
+
+    rootList.Add(&root);
+    goldList.Add(NULL);
+
+    Backward(rootList, goldList, loss);
+}
+
+/* 
+backward propagation to obtain gradient wrt. the loss/error function
+with a number of root nodes. It takes three steps: 
+1) find the topological order of the nodes, 
+2) compute the gradient of the loss for every root, and 
+3) visit the nodes from output to input and call BackwardNode on each of them
+>> roots - a list of root nodes (output) of the network
+>> golds - a list of gold standard for the output (the i-th entry is for the i-th root)
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+{
+    Traverse(roots);
+
+    /* no node has been processed so far */
+    for(int k = 0; k < nodes.count; k++)
+        ((XTensor*)nodes.Get(k))->visitMark = NODE_UNFINISHED;
+
+    XLossGrad lossGrad;
+
+    /* gradient with respect to the loss for the output layers */
+    for(int k = 0; k < roots.count; k++){
+        XTensor * output = (XTensor*)roots.Get(k);
+        XTensor * answer = (XTensor*)golds.Get(k);
+        XLink &link = output->income;
+        int funcID = link.typeID;
+        void * params = link.params;
+
+        /* the output is generated by an activation function y = f(x) 
+           and a gold standard is given */
+        bool byFunction = (answer != NULL && link.tailNum == 1 && (funcID & FUNCTION_BASE));
+
+        /* with no predefined activation function, we compute dE/dy 
+           (y is the output) */
+        if(!byFunction){
+            XNoder::MakeGrad(output);
+            lossGrad.Compute(answer, output, output->grad, loss);
+            continue;
+        }
+
+        /* otherwise we compute dE/dx directly. dE/dy is not needed because 
+           it is of no use in the following process of back-propagation, 
+           and the output node is marked as finished */
+        XTensor * x = link.tails[0];
+        XNoder::MakeGrad(x);
+        lossGrad.Compute(answer, output, x, NULL, x->grad, funcID, params, loss);
+        output->visitMark = NODE_FINISHED;
+    }
+
+    /* from output to input, i.e., the topological order is read backward */
+    for(int k = nodes.count - 1; k >= 0; k--){
+        XTensor * node = (XTensor*)nodes.Get(k);
+        if(node->visitMark != NODE_FINISHED)
+            BackwardNode(node);
+    }
+}
+
+/* 
+backward propagation with a number of root nodes 
+when no gold standard is available
+>> roots - a list of root nodes (output) of the network
+>> loss - name of loss function
+*/
+void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
+{
+    int rootNum = roots.count;
+
+    /* one NULL gold standard for each root */
+    XList golds(rootNum);
+    for(int k = 0; k < rootNum; k++){
+        golds.Add(NULL);
+    }
+
+    Backward(roots, golds, loss);
+}
+
+/* 
+backward computation for a given node: the gradient of the node is 
+propagated to its inputs by the module that its operation belongs to. 
+A leaf node has no inputs and is simply marked as finished
+>> node - the node keeps the result of an operation (e.g., activation function)
+*/
+void XNet::BackwardNode(XTensor * node)
+{
+    if(node == NULL)
+        return;
+
+    /* the node has been processed */
+    if(node->visitMark == NODE_FINISHED)
+        return;
+
+    bool isLeaf = XNoder::IsLeaf(node);
+
+    if(!isLeaf){
+        if(XMathGrad::IsMathOP(node)){
+            XMathGrad::MakeGrad(node);
+        }
+        else if(XFuncGrad::IsFunc(node)){
+            XFuncGrad::MakeGrad(node);
+        }
+        else if(XShapeGrad::IsShapeOP(node)){
+            XShapeGrad::MakeGrad(node);
+        }
+        else{
+            ShowNTErrors("Wrong node type!");
+        }
+    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* 
+find the topological order of a network with a single root. 
+The root is wrapped into a list and the list-based version is called
+>> root - root node (or output of the net)
+*/
+void XNet::Traverse(XTensor &root)
+{
+    XList rootList(1);
+    rootList.Add(&root);
+
+    Traverse(rootList);
+}
+
+/* 
+traverse the net and find the topological order by 
+depth-first search (Tarjan's algorithm). 
+A new network id is made for every call. After the call "nodes" keeps 
+the nodes in topological order (inputs first), and "outputs", "inputs" and 
+"gradNodes" keep the root nodes, the leaf nodes and the nodes with isGrad 
+set, respectively. All the four lists are rebuilt from scratch, so that 
+nothing is left over from a previous traversal
+>> roots - a list of roots (or output nodes)
+*/
+void XNet::Traverse(XList &roots)
+{
+    id = MakeNetID();
+
+    /* drop the result of the previous traversal (if any) */
+    nodes.Clear();
+    outputs.Clear();
+    inputs.Clear();
+    gradNodes.Clear();
+ 
+    for (int i = 0; i < roots.count; i++)
+        TarjanVisit((XTensor*)roots.Get(i), nodes, id);
+
+    /* a node may fall into more than one category */
+    for(int i = 0; i < nodes.count; i++){
+        XTensor * node = (XTensor*)nodes.Get(i);
+        if(XNoder::IsRoot(node))
+            outputs.Add(node);
+        if(XNoder::IsLeaf(node))
+            inputs.Add(node);
+        if(XNoder::IsGrad(node))
+            gradNodes.Add(node);
+    }
+}
+
+/* 
+depth-first search given a node (Tarjan's algorithm for topological ordering).
+The state of a node is kept in its visitMark, relative to the code of the 
+current traversal:
+  code + 1 - the node is being visited (meeting it again means a circle)
+  code + 2 - the node is done and has been put in the list
+  any other value - the node has not been visited in this traversal
+A node that is done is skipped, so that a node shared by several consumers 
+appears in the list only once
+>> node - the node to visit (mark 0:unvisited, 1:visiting, 2:done)
+>> orders - topological order of the nodes
+>> code - code of the network
+*/
+void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
+{
+    if(node == NULL)
+        return;
+
+    if(node->visitMark == code + 1){
+        ShowNTErrors("There is a circle in the network\n");
+    }
+    else if(node->visitMark <= code || node->visitMark > code + 2){
+        node->visitMark = code + 1;
+        XLink &income = node->income;
+
+        /* the inputs of the node go first */
+        for(int i = 0; i < income.tailNum; i++){
+            XTensor * child = income.tails[i];
+            if(child == NULL)
+                continue;
+            TarjanVisit(child, orders, code);
+        }
+
+        node->visitMark = code + 2;
+        orders.Add(node);
+    }
+}
+
+/* 
+dump the nodes of the network in the order they are kept in "nodes". 
+For each node we output its index and id, the tensor itself, and 
+its gradient (or a notice if there is no gradient)
+>> file - the file for dumping
+*/
+void XNet::Dump(FILE * file)
+{
+    for(int k = 0; k < nodes.count; k++){
+        XTensor * tensor = (XTensor*)nodes.Get(k);
+
+        fprintf(file, "node %d: %d\n", k, tensor->id);
+        tensor->Dump(file, "tensor: ");
+
+        if(tensor->grad == NULL)
+            fprintf(file, "no gradient!\n");
+        else
+            tensor->grad->Dump(file, "grad: ");
+
+        fprintf(file, "\n");
+    }
+}
+
+}
\ No newline at end of file
--- a/source/network/XNet.h
+++ b/source/network/XNet.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-12
+ * We expected a heavy rain today but a drizzle came down. Should I
+ * take a big umbrella?
+ */
+
+#include "../tensor/XTensor.h"
+#include "../tensor/function/FHeader.h"
+
+#ifndef __XNET_H__
+#define __XNET_H__
+
+namespace nts{
+
+/* a tensor network (or graph): it keeps the nodes found by a traversal 
+   from the output(s) and runs back-propagation over them */
+struct XNet
+{
+    /* id of the network (set by Traverse) */
+    unsigned int id;
+
+    /* all the nodes of the network, in topological order */
+    XList nodes;
+
+    /* nodes that are flagged to keep the gradient (e.g., for SGD) */
+    XList gradNodes;
+
+    /* root nodes, i.e., output of the network */
+    XList outputs;
+
+    /* leaf nodes, i.e., input of the network */
+    XList inputs;
+
+    /* constructor */
+    XNet();
+
+    /* destructor */
+    ~XNet();
+
+    /* empty all the node lists */
+    void Clear();
+
+    /* back-propagation for one root node, given its gold standard */
+    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
+
+    /* back-propagation for one root node, with no gold standard */
+    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
+
+    /* back-propagation for a list of root nodes, 
+       given a gold standard for each of them */
+    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+
+    /* back-propagation for a list of root nodes, with no gold standard */
+    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+
+    /* backward computation for one node */
+    void BackwardNode(XTensor * node);
+
+    /* find the topological order of the nodes reachable from a root 
+       by depth-first search (Tarjan's algorithm) */
+    void Traverse(XTensor &root);
+
+    /* find the topological order of the nodes reachable from a list of roots 
+       by depth-first search (Tarjan's algorithm) */
+    void Traverse(XList &roots);
+
+    /* one step of the depth-first search (Tarjan's algorithm for topological ordering) */
+    void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
+
+    /* output the nodes and their gradients to a file */
+    void Dump(FILE * file);
+};
+
+/* we make a unique id for every network (i.e., every traversal) */
+extern unsigned int netIDGlobal;
+extern MUTEX_HANDLE netMutex;
+extern unsigned int MakeNetID();
+
+}
+
+#endif
\ No newline at end of file
--- a/source/network/XNoder.cpp
+++ b/source/network/XNoder.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ */
+
+#include "XNoder.h"
+
+namespace nts{
+
+/* 
+make sure that a node has a gradient tensor of its own shape. 
+If the current gradient does not match the node, it is deleted and 
+replaced by a new tensor made from the node and filled with zeros; 
+a gradient that matches is left as it is
+>> node - the node that needs a gradient tensor (nothing is done for NULL)
+*/
+void XNoder::MakeGrad(XTensor * node)
+{
+    if(node == NULL)
+        return;
+
+    /* the gradient tensor is already in place */
+    if(XTensor::IsSameShaped(node, node->grad))
+        return;
+
+    delete node->grad;
+    node->grad = NewTensor(node);
+    node->grad->SetZeroAll();
+}
+
+/* whether the node is a leaf node (input), i.e., it has no incoming tensors. 
+   A NULL node is not a leaf */
+bool XNoder::IsLeaf(XTensor * node)
+{
+    return node != NULL && node->income.tailNum == 0;
+}
+
+/* whether the node is a root node (output), i.e., it has no outgoing tensors. 
+   A NULL node is not a root */
+bool XNoder::IsRoot(XTensor * node)
+{
+    return node != NULL && node->outgo.tailNum == 0;
+}
+
+/* whether the node is flagged (by isGrad) to keep the gradient. 
+   The answer is false for a NULL node */
+bool XNoder::IsGrad(XTensor * node)
+{
+    if(node == NULL)
+        return false;
+
+    return node->isGrad ? true : false;
+}
+
+}
\ No newline at end of file
--- a/source/network/XNoder.h
+++ b/source/network/XNoder.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * low-level utilities
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
+ */
+
+#include "../tensor/XTensor.h"
+
+#ifndef __XNODER_H__
+#define __XNODER_H__
+
+namespace nts{
+
+#define NODE_UNFINISHED 0
+#define NODE_DOING      1
+#define NODE_FINISHED   2
+
+/* low-level utilities for the nodes of a network.
+   All members are static, so no instance of the class is needed */
+class XNoder
+{
+public:
+    /* make sure that a node has a gradient tensor of its own shape */
+    static void MakeGrad(XTensor * node);
+
+    /* whether the node is a leaf node (input) */
+    static bool IsLeaf(XTensor * node);
+
+    /* whether the node is a root node (output) */
+    static bool IsRoot(XTensor * node);
+
+    /* whether the node keeps the gradient */
+    static bool IsGrad(XTensor * node);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
--- a/source/sample/fnnlm/FNNLM.h
+++ b/source/sample/fnnlm/FNNLM.h
@@ -30,9 +30,9 @@
 #ifndef __FNNLM_H__
 #define __FNNLM_H__

-#include "../../XGlobal.h"
-#include "../../XTensor.h"
-#include "../../core/CHeader.h"
+#include "../../tensor/XGlobal.h"
+#include "../../tensor/XTensor.h"
+#include "../../tensor/core/CHeader.h"

 using namespace nts;

@@ -127,7 +127,6 @@ struct FNNNet
 };

 /* entry of the program */
-extern "C" 
 int FNNLMMain(int argc, const char ** argv);

 };

--- a/source/Main.cpp
+++ b/source/Main.cpp
@@ -28,29 +28,67 @@
 #include <time.h>
 #include "XTensor.h"
 #include "XDevice.h"
-#include "./sample/fnnlm/FNNLM.h"
+#include "./test/Test.h"

-#include "test/Test.h"
+//#define CRTDBG_MAP_ALLOC
+//#include <stdlib.h>  
+//#include <crtdbg.h> 

 using namespace nts;
-using namespace samplefnnlm;

-//#define CRTDBG_MAP_ALLOC    
-//#include <stdlib.h>    
-//#include <crtdbg.h> 
+void SmallTest();

 int main( int argc, const char ** argv )
 {
-    srand((unsigned)time(0));
+    //_CrtSetBreakAlloc(123);
+
+    /* a tiny test */
+    SmallTest();
+
+    //_CrtDumpMemoryLeaks();
+    //return 0;
+
    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
-    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
-        return FNNLMMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
        fprintf(stderr, "use of tensors. All you need is to ... \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
-        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
    }
+
+    //_CrtDumpMemoryLeaks();
+
    return 0;
 }
+
+/*
+a tiny smoke test called at the start of main()
+It initializes two 2 x 2 tensors, combines them with Sum/Multiply, the
+overloaded "+" and "*" operators and Lin(), then checks and shows the
+network encoded in the final result "d" and dumps all four tensors.
+All output is written to stderr.
+*/
+void SmallTest()
+{
+    XTensor a;
+    XTensor b;
+    XTensor c;
+    XTensor d;
+
+    /* a and b are 2 x 2 and start out as all zeros */
+    InitTensor2D(&a, 2, 2);
+    InitTensor2D(&b, 2, 2);
+    a.SetZeroAll();
+    b.SetZeroAll();
+
+    /* presumably sets a(0,0) = 1 and a(1,1) = 2 (value first, then indices) - TODO confirm */
+    a.Set2D(1.0F, 0, 0);
+    a.Set2D(2.0F, 1, 1);
+
+    /* function-style API: b = a + Multiply(a, a) */
+    b = Sum(a, Multiply(a, a));
+
+    /* this is prohibited !!!!!!!!!!!!! */
+    /* NOTE(review): the reason is not visible here; presumably initializing a
+       tensor directly from such an expression is not supported, whereas
+       assigning to an already declared tensor (as done below) is - confirm */
+    //XTensor c = a * b + a;
+    //XTensor d = a + b + c.Lin(0.5F);
+    
+    /* operator-style API; Lin(0.5F) is a linear transformation with scale 0.5 and the default shift */
+    c = a * b + a;
+    d = a + b + c.Lin(0.5F);
+
+    /* check and print the network whose root node is d */
+    XLink::CheckNetwork(&d);
+    XLink::ShowNetwork(stderr, &d);
+        
+    /* dump every tensor with a label */
+    a.Dump(stderr, "a:");
+    b.Dump(stderr, "b:");
+    c.Dump(stderr, "c:");
+    d.Dump(stderr, "d:");
+}
--- a/source/XBLAS.cpp
+++ b/source/XBLAS.cpp
--- a/source/XBLAS.h
+++ b/source/XBLAS.h
--- a/source/XDataType.cpp
+++ b/source/XDataType.cpp
@@ -82,7 +82,7 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
 }

 /* 
-data conversion
+data type conversion
 >> devID - device id
 >> s - source data array
 >> typeS - source data type
@@ -92,7 +92,7 @@ data conversion
 */
 void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
 {
-    CheckNTErrors((devID < 0), "This code must be run on GPUs!");
+    CheckNTErrors((devID < 0), "This code must be run on CPUs!");

    if(typeS == typeT)
        return;

--- a/source/XDataType.h
+++ b/source/XDataType.h
@@ -47,9 +47,9 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
 extern TENSOR_DATA_TYPE GetDataType(const char * typeName);

 /* data conversion (for lower precision computation) */
-extern "C" unsigned short FloatToFloat16(float f);
-extern "C" float Float16ToFloat(unsigned short h);
-extern "C" void ConvertDataType(int devID, 
+unsigned short FloatToFloat16(float f);
+float Float16ToFloat(unsigned short h);
+void ConvertDataType(int devID, 
                                void * s, TENSOR_DATA_TYPE typeS, 
                                void * t, TENSOR_DATA_TYPE typeT, int size);


--- a/source/XDevice.cpp
+++ b/source/XDevice.cpp
--- a/source/XDevice.h
+++ b/source/XDevice.h
--- a/source/XGlobal.cpp
+++ b/source/XGlobal.cpp
--- a/source/XGlobal.h
+++ b/source/XGlobal.h
@@ -74,7 +74,7 @@ namespace nts {
 { \
    if(!(x)) \
    { \
-        fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
+        fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
        exit(1); \
    } \
 } \
@@ -83,7 +83,7 @@ namespace nts {
 { \
    if(!(x)) \
    { \
-        fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
+        fprintf(stderr, "[ERROR] calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__); \
        exit(1); \
    } \
 } \
@@ -91,7 +91,7 @@ namespace nts {
 #define ShowNTErrors(msg) \
 { \
    { \
-        fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
+        fprintf(stderr, "[ERROR] (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
        exit(1); \
    } \
 } \

--- a/source/XHeap.cpp
+++ b/source/XHeap.cpp
--- a/source/XHeap.h
+++ b/source/XHeap.h
--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
--- a/source/XLink.h
+++ b/source/XLink.h
@@ -34,6 +34,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 struct XTensor;

 #define MAX_OP_NAME_LENGTH 16
+#define PARAM_UNTI_SIZE    64

 /*
 This defines the link among tensors in networks. XLink can be
@@ -74,6 +75,12 @@ struct XLink
    /* name of the hyperedge type. e.g., sum, mul ... */
    char type[MAX_OP_NAME_LENGTH];
    
+    /* type id */
+    int typeID;
+
+    /* caculator (pointer to the class for computation) */
+    void * caculator;
+    
    /* constuctor */
    XLink();
    
@@ -83,8 +90,22 @@ struct XLink
    /* reset it */
    void Reset();

-    /* set edge type name */
-    void SetType(const char * typeName);
+    /* clear it */
+    void Clear();
+
+    /* clear tails */
+    void ClearTail();
+
+    /* clear the incoming node list of tensor node */
+    static
+    void ClearIncoming(XTensor * node);
+    
+    /* clear the outgoing node list of tensor node */
+    static
+    void ClearOutgoing(XTensor * node);
+
+    /* set edge type id and name */
+    void SetType(int id);

    /* set head */
    void SetHead(XTensor * h);
@@ -95,19 +116,32 @@ struct XLink
    /* add two tails in one time */
    void AddTwoTails(XTensor * t1, XTensor * t2);

-    /* add a integer parameter */
+    /* add a parameter in default type */
    void AddParam(DTYPE param);

-    /* add a integer parameter */
+    /* add a parameter */
    void AddParam(void * param, int size);

+    /* get a parameter in default type */
+    DTYPE GetParam(int i);
+
+    /* get a parameter in integer */
+    int GetParamInt(int i);
+    
+    /* get a parameter in MATRIX_TRANS_TYPE */
+    MATRIX_TRANS_TYPE GetParamTrans(int i);
+
    /* create a hyper edge with two input tensors and a output tensor */
    static
-    void MakeLink(XTensor * t1, XTensor * t2, XTensor * h, const char * typeName);
+    void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
+
+    /* create a hyper edge with a list of input tensors and a output tensor */
+    static
+    void MakeLink(const XList * list, XTensor * h, int id);

-    /* create a hyper edge with a list of tensors and a output tensor */
+    /* create a hyper edge with an input tensor and a list of output tensors */
    static
-    void MakeLink(XList * list, XTensor * h, const char * typeName);
+    void MakeLink(XTensor * h, XList * list, int id);

    /* add a parameter */
    static
@@ -116,8 +150,36 @@ struct XLink
    /* add an integer parameter */
    static
    void AddParamToHeadInt(XTensor * h, int param);
+
+    /* add a MATRIX_TRANS_TYPE parameter */
+    static
+    void AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param);
+
+    /* add a boolean parameter */
+    static
+    void AddParamToHeadBool(XTensor * h, bool param);
+
+    /* add a pointer parameter */
+    static
+    void AddParamToHeadPointer(XTensor * h, void * param);
+
+    /* replace a node with another, i.e., we redirect the links to the new node */
+    static 
+    void Replace(const XTensor * oldOne, XTensor * newOne);
+
+    /* copy links of a given node */
+    static
+    void CopyIncoming(const XTensor * reference, XTensor * target);
+
+    /* check the correctness of the network encoded in a root node (tensor) */
+    static
+    void CheckNetwork(XTensor * root);
+
+    /* show the network encoded in a root node (tensor) */
+    static
+    void ShowNetwork(FILE * file, XTensor * root);
 };
    
 } // namespace nts(NiuTrans.Tensor)

-#endif // __XLINK_H__
\ No newline at end of file
+#endif // __XLINK_H__
--- a/source/XList.cpp
+++ b/source/XList.cpp
@@ -42,6 +42,8 @@
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts{

+XList NULLList;
+
 /* constructor */
 XList::XList()
 {
@@ -111,7 +113,7 @@ void XList::Create(int myMaxNum, XMem * myMem)
 add an item into the list
 >> item - pointer to the item
 */
-void XList::Add(void * item)
+void XList::Add(const void * item)
 {
    if( count == maxNum ){
        void ** newItems;
@@ -126,7 +128,8 @@ void XList::Add(void * item)
        maxNum = maxNum * 2 + 1;
    }
    
-    items[count++] = item;
+    MTYPE p = (MTYPE)item;
+    items[count++] = (MTYPE*)p;

 }

@@ -203,7 +206,7 @@ void XList::Insert(int pos, void * item)
 }

 /* get the item at position i */
-void * XList::GetItem(int i)
+void * XList::GetItem(int i) const
 {
    if( i >= 0 && i < count )
        return items[i];
@@ -355,4 +358,4 @@ void XList::Shuffle(int nround, int beg, int len)
 }

 } 
-/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
+/* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/XList.h
+++ b/source/XList.h
@@ -69,12 +69,12 @@ public:

    /* utilities */
    void Create(int myMaxNum, XMem * myMem);
-    void Add(void * item);
+    void Add(const void * item);
    void Add(void ** inputItems, int inputItemCount);
    void AddList(XList * l);
    void AddInt(int i);
    void Insert(int pos, void * item);
-    void * GetItem(int i);   
+    void * GetItem(int i) const;   
    int GetItemInt(int i);
    void SetItem(int i, void * item);
    void SetItemInt(int i, int item);
@@ -96,7 +96,9 @@ public:

 };

+extern XList NULLList;
+
 } 
 /* end of the nts (NiuTrans.Tensor) namespace */

-#endif
\ No newline at end of file
+#endif
--- a/source/XMem.cpp
+++ b/source/XMem.cpp
--- a/source/XMem.h
+++ b/source/XMem.h
@@ -24,6 +24,8 @@
 #ifndef __XMEM_H__
 #define __XMEM_H__

+#include <stdlib.h>
+
 #ifdef CUDA_BLAS
 #define USE_CUDA
 #endif

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
+ */
+
+#include "XName.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+    
+/*
+get the printable name of an operator
+The id is first assigned to a group by testing the group bit
+(MATH_BASE, DATA_BASE or FUNCTION_BASE) and then matched against
+the ids of that group only.
+>> type - operator id (see the definitions in XName.h)
+<< return - name of the operator, or "NULL" if the id is not known
+*/
+const char * GetOPName(int type)
+{
+    if ((type & MATH_BASE) != 0){
+        /* math and reduce operators */
+        switch (type){
+            case MATH_ABSOLUTE:           return "M_ABSOLUTE";
+            case MATH_MATRIXMUL:          return "M_MATRIXMUL";
+            case MATH_MATRIXMULBATCHED:   return "M_MATRIXMULBATCHED";
+            case MATH_MULTIPLY:           return "M_MULTIPLY";
+            case MATH_NEGATE:             return "M_NEGATE";
+            case MATH_SIGN:               return "M_SIGN";
+            case MATH_SUM:                return "M_SUM";
+            case MATH_LOG:                return "M_LOG";
+            case MATH_NORMALIZE:          return "M_NORMALIZE";
+            case MATH_POWER:              return "M_POWER";
+            case MATH_SCALEANDSHIFT:      return "M_SCALEANDSHIFT";
+            case REDUCE_REDUCEMAX:        return "R_REDUCEMAX";
+            case REDUCE_REDUCEMEAN:       return "R_REDUCEMEAN";
+            case REDUCE_REDUCESUM:        return "R_REDUCESUM";
+            case REDUCE_REDUCESUMSQUARED: return "R_REDUCESUMSQUARED";
+            case REDUCE_REDUCEVARIANCE:   return "R_REDUCEVARIANCE";
+            default:                      break;
+        }
+    }
+    else if ((type & DATA_BASE) != 0){
+        /* data access, data movement, shape and sort operators */
+        switch (type){
+            case GETANDSET_SELECT:        return "G_SELECT";
+            case MOVEMENT_COPYINDEXED:    return "M_COPYINDEXED";
+            case MOVEMENT_COPYVALUES:     return "M_COPYVALUES";
+            case SHAPE_CONCATENATE:       return "S_CONCATENATE";
+            case SHAPE_MERGE:             return "S_MERGE";
+            case SHAPE_MERGE_LIST:        return "S_MERGE_LIST";
+            case SHAPE_PERMUTE:           return "S_PERMUTE";
+            case SHAPE_SPLIT:             return "S_SPLIT";
+            case SHAPE_SPLIT_LIST:        return "S_SPLIT_LIST";
+            case SHAPE_TRANSPOSE:         return "S_TRANSPOSE";
+            case SHAPE_UNSQUEEZE:         return "S_UNSQUEEZE";
+            case SORT_SORT:               return "S_SORT";
+            case SORT_TOPK:               return "S_TOPK";
+            default:                      break;
+        }
+    }
+    else if ((type & FUNCTION_BASE) != 0){
+        /* activation functions */
+        switch (type){
+            case FUNC_HARDTANH:           return "F_HARDTANH";
+            case FUNC_IDENTITY:           return "F_IDENTITY";
+            case FUNC_LOGSOFTMAX:         return "F_LOGSOFTMAX";
+            case FUNC_RECTIFY:            return "F_RECTIFY";
+            case FUNC_SIGMOID:            return "F_SIGMOID";
+            case FUNC_SOFTMAX:            return "F_SOFTMAX";
+            default:                      break;
+        }
+    }
+    
+    /* no group bit is set, or the id is not listed in its group */
+    return "NULL";
+}
+    
+} // namespace nts(NiuTrans.Tensor)
+
--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ *
+ * We define various names here
+ *
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
+ * It was really HOT these days. I can't imagine it is SO hot here in Shenyang!
+*/
+
+#ifndef __XNAME_H__
+#define __XNAME_H__
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+operator ids
+The ids are organized in three groups. Each group has a base value with a
+single bit set (MATH_BASE = 0x1000, DATA_BASE = 0x2000, FUNCTION_BASE = 0x4000)
+so that "id & BASE" tells which group an id belongs to. Inside a group the ids
+are consecutive: every id is defined as "previous id + 1". The names without
+an operator suffix (REDUCE, GETANDSET, MOVEMENT, SHAPE and SORT) only mark the
+beginning of a sub-group and take one id each.
+Every replacement list is parenthesized so that an id expands to a single
+value when it is used inside a larger expression.
+*/
+
+/* math operations */
+#define MATH_BASE               0x00001000
+#define MATH_ABSOLUTE           (MATH_BASE + 1)
+#define MATH_MATRIXMUL          (MATH_ABSOLUTE + 1)
+#define MATH_MATRIXMULBATCHED   (MATH_MATRIXMUL + 1)
+#define MATH_MULTIPLY           (MATH_MATRIXMULBATCHED + 1)
+#define MATH_NEGATE             (MATH_MULTIPLY + 1)
+#define MATH_SIGN               (MATH_NEGATE + 1)
+#define MATH_SUM                (MATH_SIGN + 1)
+
+#define MATH_LOG                (MATH_SUM + 1)
+#define MATH_NORMALIZE          (MATH_LOG + 1)
+#define MATH_POWER              (MATH_NORMALIZE + 1)
+#define MATH_SCALEANDSHIFT      (MATH_POWER + 1)
+
+#define REDUCE                  (MATH_SCALEANDSHIFT + 1)
+#define REDUCE_REDUCEMAX        (REDUCE + 1)
+#define REDUCE_REDUCEMEAN       (REDUCE_REDUCEMAX + 1)
+#define REDUCE_REDUCESUM        (REDUCE_REDUCEMEAN + 1)
+#define REDUCE_REDUCESUMSQUARED (REDUCE_REDUCESUM + 1)
+#define REDUCE_REDUCEVARIANCE   (REDUCE_REDUCESUMSQUARED + 1)
+
+/* data and shape related operations */
+#define DATA_BASE               (MATH_BASE * 2)
+#define GETANDSET               (DATA_BASE + 1)
+#define GETANDSET_SELECT        (GETANDSET + 1)
+
+#define MOVEMENT                (GETANDSET_SELECT + 1)
+#define MOVEMENT_COPYINDEXED    (MOVEMENT + 1)
+#define MOVEMENT_COPYVALUES     (MOVEMENT_COPYINDEXED + 1)
+
+#define SHAPE                   (MOVEMENT_COPYVALUES + 1)
+#define SHAPE_CONCATENATE       (SHAPE + 1)
+#define SHAPE_MERGE             (SHAPE_CONCATENATE + 1)
+#define SHAPE_MERGE_LIST        (SHAPE_MERGE + 1)
+#define SHAPE_PERMUTE           (SHAPE_MERGE_LIST + 1)
+#define SHAPE_SPLIT             (SHAPE_PERMUTE + 1)
+#define SHAPE_SPLIT_LIST        (SHAPE_SPLIT + 1)
+#define SHAPE_TRANSPOSE         (SHAPE_SPLIT_LIST + 1)
+#define SHAPE_UNSQUEEZE         (SHAPE_TRANSPOSE + 1)
+
+#define SORT                    (SHAPE_UNSQUEEZE + 1)
+#define SORT_SORT               (SORT + 1)
+#define SORT_TOPK               (SORT_SORT + 1)
+
+/* activation functions */
+#define FUNCTION_BASE           (DATA_BASE * 2)
+#define FUNC_HARDTANH           (FUNCTION_BASE + 1)
+#define FUNC_IDENTITY           (FUNC_HARDTANH + 1)
+#define FUNC_LOGSOFTMAX         (FUNC_IDENTITY + 1)
+#define FUNC_RECTIFY            (FUNC_LOGSOFTMAX + 1)
+#define FUNC_SIGMOID            (FUNC_RECTIFY + 1)
+#define FUNC_SOFTMAX            (FUNC_SIGMOID + 1)
+
+/* get operator name */
+const char * GetOPName(int type);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __XNAME_H__
--- a/source/XPRunner.cpp
+++ b/source/XPRunner.cpp
--- a/source/XPRunner.h
+++ b/source/XPRunner.h
--- a/source/XQueue.cpp
+++ b/source/XQueue.cpp
--- a/source/XQueue.h
+++ b/source/XQueue.h
--- a/source/XStream.cpp
+++ b/source/XStream.cpp
--- a/source/XStream.h
+++ b/source/XStream.h
--- a/source/XTensor.cpp
+++ b/source/XTensor.cpp
--- a/source/XTensor.h
+++ b/source/XTensor.h
@@ -55,12 +55,13 @@ struct XLink;
 #define UNSAFE_BUT_FAST_MEM
 #define FAST_MATRIX

-/* 
-We implemente the tensor class here though we have defined the class of XMatrix. It
-is the parent class of XMatrix.
-*/
+/* XTensor is a class to do everything a tensor can do :) */
 struct XTensor
 {
+public:
+    /* id */
+    int id;
+
    /* memory pool */
    XMem * mem;

@@ -70,6 +71,10 @@ struct XTensor
    /* copy of data on the host memory. It is only activated 
       when the matrix is operated on GPUs */
    void * dataHost;
+    
+    /* a pointer to data (i.e., a pointer to the address of "data".
+       This is for reset "data" when XTensor is used as a const variable. */
+    void ** dataP;

    /* 
    device id 
@@ -130,6 +135,18 @@ struct XTensor

    /* indicates whether the tensor is initialized or not */
    bool isInit;
+
+    /* indicates whether the tensor is created temporarily */
+    bool isTmp;
+
+    /* indicates whether the tensor keeps the gradient when used as model parameters */
+    bool isGrad;
+
+    /* mark for traversing the graph */
+    unsigned int visitMark;
+
+    /* gradient (for back-propagation) */
+    XTensor * grad;
    
    /*
    the link used to form networks. Note that when we compute on tensors, we actually create a
@@ -152,31 +169,49 @@ struct XTensor
    XTensor();

    /* constructor */
-    XTensor(XTensor * reference);
+    XTensor(const XTensor * reference);

    /* constructor */
    XTensor(const int myOrder, int myDevID, XMem * myMem);

    /* constructor */
    XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType, 
-            const float myDenseRatio, XMem * myMem);
+            const float myDenseRatio, int myDevID, XMem * myMem);
+
+    /* copy constructor */
+    XTensor(const XTensor &reference);

    /* de-constructor */
    ~XTensor();

+    /* initialize member variables */
+    void Init();
+
    /* delete data arrays */
    void DestroyData();

+    /* shallow copy of tensor */
+    void ShallowCopy(const XTensor &tensor);
+
    /* overloading of the equal-sign */
-    XTensor& operator = (const XTensor& tensor);
+    XTensor& operator= (const XTensor &tensor);
+
+    /* overloading of the plus-sign */
+    XTensor  operator+ (const XTensor &tensor);
+
+    /* overloading of the multiply-sign */
+    XTensor  operator* (const XTensor &tensor);
+
+    /* linear transformation */
+    XTensor Lin(DTYPE scale, DTYPE shift = 0);

    /* judge whether the two matrices are in the same type and size */
    static
-    bool IsIdentical(XTensor * a, XTensor * b);
+    bool IsSameShaped(const XTensor * a, const XTensor * b);

    /* judge whether the three matrices are in the same type and size */
    static
-    bool IsIdentical(XTensor * a, XTensor * b, XTensor * c);
+    bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);

    /* set the size of each dimension */
    void SetDim(int * myDimSize);
@@ -188,7 +223,7 @@ struct XTensor
    void Reshape(const int order, const int * myDimSize);

    /* get the number of items in the data array */
-    int GetSize();
+    int GetSize() const;

    /* get size of the memory used */
    int GetDataSizeInChar();
@@ -211,6 +246,12 @@ struct XTensor
    /* check whether the data array is the same as the answer */
    bool CheckData(const void * answer, int num, int beg = 0);

+    /* check whether the data array is the same as the answer */
+    bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
+    
+    /* set the pointer to "data" */
+    void SetDataPointer();
+
    /* set the cell to the ascending order along a given dimension */
    void SetAscendingOrder(int dim);

@@ -218,17 +259,26 @@ struct XTensor
    DTYPE Get(int index[], int size = -1);

    /* get the pointer to a cell */
-    void * GetCell(int index[], int size = -1);
+    void * GetCell(int index[], int size = -1) const;

-    /* get the value of a cell in a 1d tensor */
+    /* get the default type value of a cell in a 1d tensor */
    DTYPE Get1D(int i);

-    /* get the value of a cell in a 2d tensor */
-    DTYPE Get2D(int ni, int mi);
+    /* get the default type value of a cell in a 2d tensor */
+    DTYPE Get2D(int ni, int mi) const;
    
-    /* get the value of a cell in a 3d tensor */
+    /* get the default type value of a cell in a 3d tensor */
    DTYPE Get3D(int d0, int d1, int d2);

+    /* get the int value of a cell in a 1d tensor */
+    int Get1DInt(int i);
+
+    /* get the int value of a cell in a 2d tensor */
+    int Get2DInt(int ni, int mi);
+    
+    /* get the int value of a cell in a 3d tensor */
+    int Get3DInt(int d0, int d1, int d2);
+
    /* get the value of a cell in a sparse tensor */
    DTYPE GetInSparse(int i);

@@ -253,6 +303,12 @@ struct XTensor
    /* get the number of non-zero elements (in a sparse tensor) */
    int GetNonzeroSize();

+    /* set the tensor as "temporary" */
+    void SetTMP(bool myIsTmp = true);
+
+    /* set the tensor as "keep-gradient" */
+    void SetGrad(bool myIsGrad = true);
+
    /* resize a matrix with a specified matrix size */
    bool Resize(const int myOrder, const int * myDimSize,
                const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
@@ -267,7 +323,7 @@ struct XTensor
    bool Resize(const XTensor * myTensor);

    /* binary search to find an element in a sparse matrix*/
-    bool BinarySearch(int key, DTYPE &value, void * &position);
+    bool BinarySearch(int key, DTYPE &value, void * &position) const;

    /* dump data to a file */
    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
@@ -287,6 +343,12 @@ struct XTensor
    void FreeData(XTensor * matrix, XMem * myMem = NULL, bool useBuf = false);
 };

+/* we make a unique id for every tensor */
+extern int tensorIDGlobal;
+extern MUTEX_HANDLE tensorMutex;
+extern XTensor NULLTensor;
+extern int MakeTensorID();
+
 /************************************************
 * we define the "new and delete" functions below
 */
@@ -317,7 +379,7 @@ void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, co
                  const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);

 /* initialize a tensor with a reference tensor */
-void InitTensor(XTensor * tensor, XTensor * reference);
+void InitTensor(XTensor * tensor, const XTensor * reference);

 /* generate a XTensor */
 XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,

--- a/source/XThread.cpp
+++ b/source/XThread.cpp
--- a/source/XThread.h
+++ b/source/XThread.h
@@ -32,9 +32,6 @@
 #define USE_PTHREAD // for linux
 #endif

-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
-
 //////////////////////////////////////////////////
 // neccessary libs
 #ifdef USE_PTHREAD
@@ -46,12 +43,15 @@ namespace nts{
 #endif
 #endif

+/* the nts (NiuTrans.Tensor) namespace */
+namespace nts{
+
 #if(defined(_WIN32) && !defined (__CYGWIN__))
 #define CRFPP_USE_THREAD 1
 #define BEGINTHREAD(src, stack, func, arg, flag, id) \
-     (HANDLE)_beginthreadex((void *)(src), (unsigned)(stack), \
-                            (unsigned(_stdcall *)(void *))(func), (void *)(arg), \
-                            (unsigned)(flag), (unsigned *)(id))
+                   (HANDLE)_beginthreadex((void *)(src), (unsigned)(stack), \
+                   (unsigned(_stdcall *)(void *))(func), (void *)(arg), \
+                   (unsigned)(flag), (unsigned *)(id))
 #endif

 //////////////////////////////////////////////////

--- a/source/XUtility.cpp
+++ b/source/XUtility.cpp
@@ -176,12 +176,16 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
    }
 #ifdef USE_CUDA
    else if(devIDT >= 0 && devIDS < 0){
-        CheckNTErrors((cudaMemcpy(t, s, size, cudaMemcpyHostToDevice) == cudaSuccess),
-                            "cudaMemcpy error (cudaMemcpyHostToDevice)");
+        cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
+        if(error != cudaSuccess){
+            ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
+        }
    }
    else if(devIDT < 0 && devIDS >= 0){
-        CheckNTErrors((cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost) == cudaSuccess),
-                            "cudaMemcpy error (cudaMemcpyDeviceToHost)");
+        cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
+        if(error != cudaSuccess){
+            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+        }
    }
    else{
        //if(devIDT == devIDS){

--- a/source/XUtility.h
+++ b/source/XUtility.h
--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+*/
+
+/* this is a header to include all functions in the "core" workspace */
+
+#ifndef __CHEADER_H__
+#define __CHEADER_H__
+
+#include "../XTensor.h"
+
+#include "shape/Concatenate.h"
+#include "shape/ConcatenateSolely.h"
+#include "movement/CopyBlocks.h"
+#include "movement/CopyBlocksInGrid.h"
+#include "movement/CopyBlocksOnSite.h"
+#include "movement/CopyData2D.h"
+#include "movement/CopyIndexed.h"
+#include "movement/CopyInGrid.h"
+#include "movement/CopyValues.h"
+#include "utilities/FlushToMem.h"
+#include "shape/MakeMergeBlockIndex.h"
+#include "shape/MakeSplitBlockIndex.h"
+#include "arithmetic/MatrixMul.h"
+#include "arithmetic/MatrixMul2D.h"
+#include "arithmetic/MatrixMul2DMultiTheading.h"
+#include "arithmetic/MatrixMul2DParallel.h"
+#include "arithmetic/MatrixMulBatched.h"
+#include "arithmetic/MatrixMULBatchedCPU.h"
+#include "shape/Merge.h"
+#include "shape/MergeBlockLists.h"
+#include "arithmetic/Multiply.h"
+#include "arithmetic/Negate.h"
+#include "math/Normalize.h"
+#include "shape/Permute.h"
+#include "math/Power.h"
+#include "reduce/ReduceMax.h"
+#include "reduce/ReduceMean.h"
+#include "reduce/ReduceStandardVariance.h"
+#include "reduce/ReduceSum.h"
+#include "reduce/ReduceSumSquared.h"
+#include "reduce/ReduceVariance.h"
+#include "math/ScaleAndShift.h"
+#include "getandset/Select.h"
+#include "getandset/SetData.h"
+#include "sort/Sort.h"
+#include "shape/Split.h"
+#include "arithmetic/Sum.h"
+#include "arithmetic/SumByColumnTV.h"
+#include "arithmetic/SumByColumnVT.h"
+#include "sort/TopK.h"
+#include "shape/Transpose.h"
+#include "shape/Unsqueeze.h"
+#include "utilities/XMatrixSegment.h"
+#include "arithmetic/XTensorBLAS.h"
+
+#endif // __CHEADER_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.cpp
+++ b/source/tensor/core/arithmetic/Absolute.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include <math.h>
+#include "../../XTensor.h"
+#include "../../XName.h"
+#include "Absolute.h"
+#include "Absolute.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+compute the absolute value of every entry: b = |a|
+If the code is built with USE_CUDA and "a" has a non-negative device id,
+the work is handed over to _CudaAbsolute. Otherwise the entries are
+processed one by one here.
+>> a - the input tensor
+>> b - the output tensor (it is checked against "a" by IsSameShaped)
+*/
+void _Absolute(const XTensor * a, XTensor * b)
+{
+#ifdef USE_CUDA
+    /* run it on GPUs */
+    if (a->devID >= 0) {
+        _CudaAbsolute(a, b);
+        return;
+    }
+#endif
+
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+
+    const DTYPE * input = (const DTYPE*)a->data;
+    DTYPE * output = (DTYPE*)b->data;
+    const int num = a->unitNum;
+
+    /* "input" and "output" may point to the same array (see _AbsoluteMe);
+       this is fine because entry i is read before it is overwritten */
+    for (int i = 0; i < num; i++)
+        output[i] = (DTYPE)fabs(input[i]);
+}
+
+/*
+set every entry to its absolute value (do it on site)
+keep the result in the input tensor a and return nothing
+This simply calls _Absolute with "a" as both the input and the output.
+>> a - the tensor we are processing (overwritten with the result)
+*/
+void _AbsoluteMe(XTensor * a)
+{
+    _Absolute(a, a);
+}
+
+/*
+compute the absolute value of every entry and return the result as a
+new XTensor. The input tensor is left unchanged.
+>> a - the input tensor
+<< return - a tensor that holds |a|
+*/
+XTensor Absolute(const XTensor & a)
+{
+    /* the result is constructed with "a" as the reference and marked as temporary */
+    XTensor result(&a);
+    result.SetTMP();
+    
+    /* do the computation */
+    _Absolute(&a, &result);
+    
+    /* record the operation in the network: "a" is the only input and "result" is the output */
+    XLink::MakeLink(&a, NULL, &result, MATH_ABSOLUTE);
+    
+    return result;
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.cu
+++ b/source/tensor/core/arithmetic/Absolute.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Absolute.h"
+#include "Absolute.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+/*
+CUDA kernel: write the absolute value of each entry of a into b
+>> a - pointer to the input data array
+>> b - pointer to the output data array
+>> size - number of entries in the data arrays
+*/
+__global__
+void KernelAbsolute(DTYPE * a, DTYPE * b, int size)
+{
+    /* index of the entry handled by this thread */
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* threads past the end of the array have nothing to do */
+    if (idx >= size)
+        return;
+
+    b[idx] = fabs(a[idx]);
+}
+
+/*
+set each entry to its absolute value (CUDA Kernel)
+This is for float16 computation
+>> a - pointer to input data array
+>> b - pointer to output data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(__half * a, __half * b, int size)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* 
+    the value is converted to float, processed and converted back, so that
+    no native half-precision arithmetic is required from the device.
+    (previously this kernel returned immediately and b was never written)
+    */
+    if (i < size)
+        b[i] = __float2half(fabsf(__half2float(a[i])));
+}
+
+/*
+set each entry to its absolute value (GPU implementation)
+>> a - input tensor
+>> b - output tensor
+*/
+void _CudaAbsolute(const XTensor * a, XTensor * b)
+{
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((a->isSparse == false), "TODO!");
+
+    /* query GDevs for the grid and block sizes used to launch the kernel */
+    int cudaGrids[3];
+    int cudaBlocks[3];
+    GDevs.GetCudaThread(a->devID, a->unitNum, cudaGrids, cudaBlocks);
+
+    /* only the first entry of each size array is used for the launch */
+    dim3 blocks(cudaGrids[0]);
+    dim3 threads(cudaBlocks[0]);
+
+    /* 
+    ProtectCudaDev/BacktoCudaDev bracket the launch
+    (presumably switching to the device of a and back - TODO confirm) 
+    */
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    /* choose the kernel overload that matches the data type of a */
+    const int num = a->unitNum;
+    if (a->dataType == DEFAULT_DTYPE) {
+        DTYPE * src = (DTYPE*)a->data;
+        DTYPE * dst = (DTYPE*)b->data;
+        KernelAbsolute<<<blocks, threads>>>(src, dst, num);
+    }
+    else if (a->dataType == X_FLOAT16) {
+        __half * src = (__half*)a->data;
+        __half * dst = (__half*)b->data;
+        KernelAbsolute<<<blocks, threads>>>(src, dst, num);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Absolute.cuh
+++ b/source/tensor/core/arithmetic/Absolute.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __ABSOLUTE_CUH__
+#define __ABSOLUTE_CUH__
+
+#include "Absolute.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* 
+set each entry to its absolute value (CUDA Kernel) 
+>> a - pointer to input data array
+>> b - pointer to output data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
+
+/* 
+set each entry to its absolute value (CUDA Kernel) with float16 data type
+>> a - pointer to input data array
+>> b - pointer to output data array
+>> size - size of the data array
+*/
+__global__
+void KernelAbsolute(__half * a, __half * b, int size);
+
+/* 
+set each entry to its absolute value 
+>> a - input tensor
+>> b - output tensor
+*/
+void _CudaAbsolute(const XTensor * a, XTensor * b);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __ABSOLUTE_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.h
+++ b/source/tensor/core/arithmetic/Absolute.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __ABSOLUTE_H__
+#define __ABSOLUTE_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* 
+set every entry to its absolute value 
+>> a - input tensor
+>> b - output tensor that receives the result
+*/
+void _Absolute(const XTensor * a, XTensor * b);
+
+/*
+set every entry to its absolute value (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor we are processing
+*/
+void _AbsoluteMe(XTensor * a);
+
+/* 
+set every entry to its absolute value (return a XTensor structure)
+make a new tensor to keep the result and return it
+>> a - input tensor
+<< return - the absolute value of the input tensor
+*/
+XTensor Absolute(const XTensor & a);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __ABSOLUTE_H__
--- a/source/core/MatrixMULBatchedCPU.cpp
+++ b/source/core/MatrixMULBatchedCPU.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMULBatchedCPU.h"
 #include "MatrixMul2D.h"
 #include "XTensorBLAS.h"
@@ -33,16 +33,16 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
 >> transposedA - indicate whether the matrix a is transposed
 >> b - another list of input matrices (2d tensors)
 >> transposedB - indicate whether the matrix b is transposed
+>> c - output matrix (2d tensor)
 >> alpha - scalar
 >> beta - scalar
->> c - output matrix (2d tensor)
 */
-void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
-                         XList * b, MATRIX_TRANS_TYPE transposedB,
-                         XList * c, DTYPE alpha, DTYPE beta)
+void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
+                          const XList * b, MATRIX_TRANS_TYPE transposedB,
+                          XList * c, DTYPE alpha, DTYPE beta)
 {
-    CheckNTErrors((a && b && c), "Empty input lists!");
-    CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
+    CheckNTErrors(a && b && c, "Empty input lists!");
+    CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");

    if (a->count == 0)
        return;
@@ -55,19 +55,15 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);
        XTensor * ci = (XTensor*)c->GetItem(i);
-        if (!XTensor::IsIdentical(aim, ai) ||
-            !XTensor::IsIdentical(bim, bi) ||
-            !XTensor::IsIdentical(cim, ci))
+        if (!XTensor::IsSameShaped(aim, ai) ||
+            !XTensor::IsSameShaped(bim, bi) ||
+            !XTensor::IsSameShaped(cim, ci))
        {
            isUniform = false;
            break;
        }
    }

-    //if(isUniform){
-    //}
-    //else{
-
    for (int i = 0; i < a->count; i++) {
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);
@@ -77,11 +73,11 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
 #ifdef USE_BLAS
        if (useBLAS)
-            MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
+            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
        else
-        MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #else
-        MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #endif
    }
    //}

--- a/source/core/MatrixMULBatchedCPU.h
+++ b/source/core/MatrixMULBatchedCPU.h
@@ -22,14 +22,13 @@
 #ifndef __MATRIXMULBATCHEDCPU_H__
 #define __MATRIXMULBATCHEDCPU_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* matrix multiplication in batch mode (CPU code) */
-extern "C"
-void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
-    DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, 
+                          XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/MatrixMul.cpp
+++ b/source/core/MatrixMul.cpp
--- a/source/tensor/core/arithmetic/MatrixMul.h
+++ b/source/tensor/core/arithmetic/MatrixMul.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+*/
+
+#ifndef __MATRIXMUL_H__
+#define __MATRIXMUL_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* MMul is a short alias of MatrixMul */
+#define MMul MatrixMul
+
+/*
+matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
+
+For the input tensors a and b, we perform matrix multiplication on the first two dimensions. 
+E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n. 
+For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x), 
+like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
+where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
+bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C. 
+C should be a tensor of z * x * n * m. 
+Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
+
+NOTE(review): the sizes quoted for the order-2 tensors (x * y and z * x) do not match 
+the shapes given for A (y * z * m) and B (x * y * n) above - confirm which is intended.
+*/
+void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+                DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+
+/* 
+matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
+make a new tensor c to keep the result and return it
+
+For the input tensors a and b, we perform matrix multiplication on the first two dimensions. 
+E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n. 
+For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x), 
+like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
+where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
+bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C. 
+C should be a tensor of z * x * n * m. 
+Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
+
+NOTE(review): this overload takes no beta argument although the formula above mentions it,
+and the example sizes are inconsistent in the same way as for _MatrixMul - confirm.
+*/
+XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, 
+                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
+
+/* matrix multiplication with no transposition c = a * b * alpha */
+XTensor MatrixMul(const XTensor &a, const XTensor &b, 
+                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
+
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __MATRIXMUL_H__
\ No newline at end of file
--- a/source/core/MatrixMul2D.cpp
+++ b/source/core/MatrixMul2D.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
 #include "MatrixMul2D.h"
 #include "MatrixMul2D.cuh"
 #include "MatrixMul2DParallel.h"
@@ -30,8 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
 matrix multiplication (for 2d tensors)
+
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
+
 >> a - tensor a
 >> transposedA - indicates whether the matrices in a are transposed
 >> b - tensor b
@@ -42,23 +44,16 @@ where trans() return the transposed matrix if the flag is fired
 >> parallelRunner - parallel processing module
 >> stream - the string for creating the job pipeline
 */
-void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-                 XTensor * b, MATRIX_TRANS_TYPE transposedB,
-                 XTensor * c, DTYPE alpha, DTYPE beta,
-                 XPRunner * parallelRunner, XStream * stream)
+void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+                  const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                  XTensor * c, DTYPE alpha, DTYPE beta,
+                  XPRunner * parallelRunner, XStream * stream)
 {
    CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
+    CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
    CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
                  "Input tensors must have a order = 2!");

-    /* make tensor connections */
-    XLink::MakeLink(a, b, c, MATH_MATRIXMUL2D);
-    XLink::AddParamToHeadInt(c, transposedA);
-    XLink::AddParamToHeadInt(c, transposedB);
-    XLink::AddParamToHead(c, alpha);
-    XLink::AddParamToHead(c, beta);
-
 	int an = a->dimSize[0], am = a->dimSize[1];
 	int bn = b->dimSize[0], bm = b->dimSize[1];
 	int cn = c->dimSize[0], cm = c->dimSize[1];
@@ -74,7 +69,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,

 #ifdef USE_CUDA
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-        CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
+        _CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
        return;
    }
 #endif
@@ -88,9 +83,9 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            c->dataType == DEFAULT_DTYPE)
        {
            if (useBLAS)
-                MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
+                _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
            else
-                MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
+                _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
        }
        else {
            // TODO!!
@@ -112,7 +107,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            int num = *((int*)b->data);
            char * p = (char*)b->data + sizeof(int); // pointer to the first tuple

-                                                            /* a * b */
+            /* a * b */
            if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
                for (int i = 0; i < num; i++) {
                    int key = *((int*)p);

--- a/source/core/MatrixMul2D.cu
+++ b/source/core/MatrixMul2D.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
 #include "MatrixMul2D.h"
 #include "MatrixMul2D.cuh"
 #include "XTensorBLAS.h"
@@ -37,17 +37,19 @@ c = a * b * \alpha
 >> aColSize - column size of matrix a
 >> aRowSize - row size of matrix a
 >> b - a sparse matrix
->> transposedA - indicates whether b is transposed
+>> transposedB - indicates whether b is transposed
 >> bNonZeroNum - number of non-zero items in b
 >> bColSize - column size of matrix b
 >> bRowSize - row size of matrix b
 >> c - the resulting (dense) matrix
+>> cColSize - column size of matrix c
+>> cRowSize - row size of matrix c
 >> alpha - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
-    void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
-    DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
+                                    void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
+                                    DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

@@ -106,8 +108,10 @@ void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, in

 /*
 matrix multiplication (for 2d tensors) (cuda version)
+
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
+
 >> a - tensor a
 >> transposedA - indicates whether the matrices in a are transposed
 >> b - tensor b
@@ -117,10 +121,9 @@ where trans() return the transposed matrix if the flag is fired
 >> beta - another coefficient
 >> stream - the string for creating the job pipeline
 */
-void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-                     XTensor * b, MATRIX_TRANS_TYPE transposedB,
-                     XTensor * c,
-                     DTYPE alpha, DTYPE beta, XStream * stream)
+void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+                      const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                      XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
 {
    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
@@ -147,7 +150,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    if (!a->isSparse && !b->isSparse) {
        CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");

-        //cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
        cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();

        /* !!!! might have problems */
@@ -155,9 +157,12 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            cublasSetStream(*handle, stream->stream);

        if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
-            CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
-                a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1],
-                alpha, beta);
+            _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, 
+                               b->data, transposedB, a->dataType, c->data, c->dataType,
+                               a->dimSize[0], a->dimSize[1], 
+                               b->dimSize[0], b->dimSize[1], 
+                               c->dimSize[0], c->dimSize[1],
+                               alpha, beta);
        }
        else {
            // TODO!!
@@ -183,7 +188,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            if (beta == 0)
                c->SetZeroAll();
            else if (beta != 1.0F) {
-                //XTensor::ScaleAndShift(c, beta, 0);
                ShowNTErrors("TODO!");
            }


--- a/source/core/MatrixMul2D.cuh
+++ b/source/core/MatrixMul2D.cuh
@@ -32,19 +32,18 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 mutilication of a dense matrix with a sparse vector
 c = a * b * \alpha
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
-    void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
-    DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
+                                    void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
+                                    DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);

 /*
 matrix multiplication (for 2d tensors) (cuda version)
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
 */
-extern "C"
-void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
-    DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
+void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+                      DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);

 #endif // USE_CUDA


--- a/source/core/MatrixMul2D.h
+++ b/source/core/MatrixMul2D.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMUL2D_H__
 #define __MATRIXMUL2D_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -31,9 +31,8 @@ matrix multiplication (for 2d tensors)
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
 */
-extern "C"
-void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
-    DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
+void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+                  DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/MatrixMul2DMultiTheading.cpp
+++ b/source/core/MatrixMul2DMultiTheading.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMul2DMultiTheading.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -38,7 +38,7 @@ argument5: matrix a
 argument6: matrix b
 argument7: matrix c (c=a*b*\alpha + c*beta)
 */
-void MatrixMul2DMultiTheading(XList * args)
+void _MatrixMul2DMultiTheading(XList * args)
 {
    int x1 = *(int*)args->GetItem(0);
    int y1 = *(int*)args->GetItem(1);

--- a/source/core/MatrixMul2DMultiTheading.h
+++ b/source/core/MatrixMul2DMultiTheading.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMUL2DMULTITHEADING_H__
 #define __MATRIXMUL2DMULTITHEADING_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -30,8 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 matrix multiplication for a block (x1,y1) - (x2,y2)
 where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
 */
-extern "C"
-void MatrixMul2DMultiTheading(XList * args);
+void _MatrixMul2DMultiTheading(XList * args);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/MatrixMul2DParallel.cpp
+++ b/source/core/MatrixMul2DParallel.cpp
@@ -19,10 +19,10 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMul2DParallel.h"
 #include "MatrixMul2DMultiTheading.h"
-#include "XMatrixSegment.h"
+#include "../utilities/XMatrixSegment.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -30,6 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 matrix multiplication (for 2d tensors) with multi-threading
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
+
 >> a - tensor a
 >> transposedA - indicates whether the matrices in a are transposed
 >> b - tensor b
@@ -39,10 +40,9 @@ where trans() return the transposed matrix if the flag is fired
 >> beta - another coefficient
 >> parallelRunner - parallel processing module
 */
-void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-    XTensor * b, MATRIX_TRANS_TYPE transposedB,
-    XTensor * c, DTYPE alpha, DTYPE beta,
-    XPRunner * parallelRunner)
+void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                          XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
 {
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
@@ -56,7 +56,7 @@ void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,

    /* a * b */
    if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
-        RunParallel2D(parallelRunner, (void*)MatrixMul2DMultiTheading, an * am * bm,
+        RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
                      cn, cm, 5,
                      a, b, c, &alpha, &beta);
    }

--- a/source/core/MatrixMul2DParallel.h
+++ b/source/core/MatrixMul2DParallel.h
@@ -22,18 +22,17 @@
 #ifndef __MATRIXMUL2DPARALLEL_H__
 #define __MATRIXMUL2DPARALLEL_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-matrix multiplication (for 2d tensors) with multi-threading
+matrix multiplication (for 2d tensors) with multi-threading.
 c = trans(a) * trans(b) * alpha + c * beta
-where trans() return the transposed matrix if the flag is fired
+where trans() return the transposed matrix if the flag is fired.
 */
-extern "C"
-void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
-    DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, 
+                          XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/MatrixMulBatched.cpp
+++ b/source/core/MatrixMulBatched.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
 #include "MatrixMulBatched.h"
 #include "MatrixMULBatchedCPU.h"
 #include "XTensorBLAS.h"
@@ -30,10 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
 matrix multiplication of the two tensors
+
 for each 2-dimensional data array in a (denoted as ai) and
 each 2-dimensional data array in b (denoted as bi), we have
 ci = trans(ai) * trans(bi) * alpha + cm * beta
 where trans() returns the transposed matrix if the flag is fired
+
 >> a - tensor a
 >> transposedA - indicates whether the matrices in a are transposed
 >> b - tensor b
@@ -41,31 +43,26 @@ where trans() returns the transposed matrix if the flag is fired
 >> c - where we keep a*b
 >> alpha - a coefficient
 >> beta - another coefficient
+>> parallelRunner - parallel processing module
 */
-void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-                      XTensor * b, MATRIX_TRANS_TYPE transposedB,
-                      XTensor * c, DTYPE alpha, DTYPE beta,
-                      XPRunner * parallelRunner)
+void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+                       const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                       XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
 {
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
-                  "Input tensors must have a order > 2!");
-
-    /* make tensor connections */
-    XLink::MakeLink(a, b, c, MATH_MATRIXMULBATCHED);
-    XLink::AddParamToHeadInt(c, transposedA);
-    XLink::AddParamToHeadInt(c, transposedB);
-    XLink::AddParamToHead(c, alpha);
-    XLink::AddParamToHead(c, beta);
-
-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-    int cn = c->dimSize[0];
-    int cm = c->dimSize[1];
+                  "Input tensors must have a order >= 2!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), 
+                  "Input tensor and output tensor must have same order!");
+
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int cn = c->dimSizeRDI[1];
+    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -87,17 +84,17 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
-    int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
-    int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
+    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
+    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
+    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

    for (int p = 0; p < blockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
        void * bp = (char*)b->data + bRealBlockSize * p;
        void * cp = (char*)c->data + cRealBlockSize * p;
-        XTensor * ai = new XTensor(2, aDimSize, a->dataType, a->denseRatio, a->mem);
-        XTensor * bi = new XTensor(2, bDimSize, b->dataType, b->denseRatio, b->mem);
-        XTensor * ci = new XTensor(2, cDimSize, c->dataType, c->denseRatio, c->mem);
+        XTensor * ai = NewTensor(2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
+        XTensor * bi = NewTensor(2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
+        XTensor * ci = NewTensor(2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
        ai->data = ap;
        bi->data = bp;
        ci->data = cp;
@@ -114,11 +111,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);

-        CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
-                              aList, transposedA,
-                              bList, transposedB,
-                              cList, aList->count,
-                              alpha, beta);
+        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+        _CudaBLASMatrixMULList(handle,
+							   aList, transposedA,
+                               bList, transposedB,
+                               cList, aList->count,
+                               alpha, beta);

        BacktoCudaDev(a->devID, devIDBackup);
 #else
@@ -127,7 +125,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    }
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-        MatrixMULBatchedCPU(aList, transposedA,
+        _MatrixMULBatchedCPU(aList, transposedA,
            bList, transposedB,
            cList, alpha, beta);
    }
@@ -155,4 +153,63 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    delete cList;
 }

+/*
+matrix multiplication of the two tensors (return a XTensor structure)
+c = trans(a) * trans(b) * alpha
+make a new tensor to keep the result and return it
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = trans(ai) * trans(bi) * alpha (beta is passed as 0 to _MatrixMulBatched)
+where trans() returns the transposed matrix if the flag is fired.
+
+>> a - tensor a
+>> transposedA - indicates whether the matrices in a are transposed
+>> b - tensor b
+>> transposedB - indicates whether the matrices in b are transposed
+>> alpha - a coefficient
+>> parallelRunner - parallel processing module
+<< return - the result of matrix multiplication of the two tensors
+*/
+XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
+                         DTYPE alpha, XPRunner * parallelRunner)
+{
+    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
+    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
+    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
+
+    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+
+    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
+
+    int order = a.order;
+    int sub = 0;
+    int * dimSize = new int[order];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    dimSize[sub++] = an;
+    dimSize[sub++] = bm;
+
+    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
+    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    c.SetTMP();
+
+    /* call _MatrixMulBatched function with beta = 0 */
+    _MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
+
+    /* tensor connections (the two transpose flags and alpha are stored as parameters of c) */
+    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
+    XLink::AddParamToHeadTrans(&c, transposedA);
+    XLink::AddParamToHeadTrans(&c, transposedB);
+    XLink::AddParamToHead(&c, alpha);
+
+    /* destroy variables */
+    delete[] dimSize;
+
+    return c;
+}
+
 } // namespace nts(NiuTrans.Tensor)
--- a/source/core/MatrixMulBatched.h
+++ b/source/core/MatrixMulBatched.h
@@ -22,20 +22,32 @@
 #ifndef __MATRIXMULBATCHED_H__
 #define __MATRIXMULBATCHED_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-matrix multiplication of the two tensors
+matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = trans(ai) * trans(bi) * alpha + cm * beta
+where trans() returns the transposed matrix if the flag is fired
+*/
+void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                       XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+
+/*
+matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
+make a new tensor to keep the result and return it
+
 for each 2-dimensional data array in a (denoted as ai) and
 each 2-dimensional data array in b (denoted as bi), we have
 ci = trans(ai) * trans(bi) * alpha + cm * beta
 where trans() returns the transposed matrix if the flag is fired
 */
-extern "C"
-void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
-    DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
+                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/Multiply.cpp
+++ b/source/core/Multiply.cpp
@@ -19,37 +19,36 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
 #include "Multiply.h"
 #include "Multiply.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 element-wise product of two tensors
+
 c(i) = a(i)*b(i) + \alpha * c(i)
 where i is the index of the item
+
 >> a - matrix a
 >> b - matrix b
 >> c - result matrix
 >> alpha - the coefficient
->>
+>> leadingDim - the dimension along which we perform broadcasting
 */
-void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha)
+void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
 	int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
-        "Unmatched tensors in multiplication!");
-    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
-
-    /* make tensor connections */
-    XLink::MakeLink(a, b, c, MATH_MULTIPLY);
-    XLink::AddParamToHeadInt(c, leadingDim);
-    XLink::AddParamToHead(c, alpha);
+                  "Unmatched tensors in multiplication!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), 
+                  "Unmatched tensors!");

 #ifdef USE_CUDA
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-        CudaMultiply(a, b, c, leadingDim, alpha);
+        _CudaMultiply(a, b, c, alpha, leadingDim);
        return;
    }
 #endif
@@ -122,4 +121,50 @@ void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha
    }
 }

-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+/*
+element-wise product of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+
+a(i) = a(i)*b(i) + \alpha * a(i)
+where i is the index of the item
+
+>> a - tensor a (where keep the result)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
+{
+    _Multiply(a, b, a, alpha, leadingDim);
+}
+
+/*
+element-wise product of two tensors (return a XTensor structure)
+make a new tensor c to keep the result and return it
+
+c(i) = a(i)*b(i)
+where i is the index of the item
+
+>> a - tensor a (the result tensor c is constructed from it)
+>> b - tensor b
+>> leadingDim - the dimension along which we perform broadcasting
+<< return - the product of the tensors
+*/
+XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
+{
+    CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
+
+    XTensor c(&a);
+    c.SetTMP();
+    
+    /* call _Multiply function with alpha = 0 */
+    _Multiply(&a, &b, &c, 0, leadingDim);
+    
+    /* tensor connections (leadingDim is stored as a parameter of c) */
+    XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
+    XLink::AddParamToHeadInt(&c, leadingDim);
+    
+    return c;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/core/Multiply.cu
+++ b/source/core/Multiply.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
 #include "Multiply.h"
 #include "Multiply.cuh"

@@ -34,7 +34,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i)
 >> c - result data array
 >> size - size of c
 */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -51,7 +51,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha
 >> size - size of c
 >> alpha - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
 >> a - tensor a
 >> b - tensor b
 >> c - result tensor
+>> alpha - the coefficient
 >> stride - the number of items we go over when move next along the leading dimension in a block
 >> ldSizeA - size of the leading dimension of a
 >> ldSizeB - size of the leading dimension of b
@@ -116,15 +117,14 @@ where i is the item index
 >> a - tensor a
 >> b - tensor b
 >> c - result tensor
->> leadingDim - leading dimension
 >> alpha - the coefficient
+>> leadingDim - dimension along which we perform broadcasting
 */
-extern "C"
-void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha)
+void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
 	int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
-        "Unmatched tensors in multiplication!");
+                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");

    int stride = 1;
@@ -137,8 +137,8 @@ void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE a
    for (int i = 0; i < a->order; i++) {
        if (i != leadingDimRDI) {
            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                a->dimSizeRDI[i] == c->dimSizeRDI[i]),
-                "Unmatched tensors!");
+                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+                          "Unmatched tensors!");
        }
        if (i < leadingDimRDI)
            stride *= a->dimSizeRDI[i];

--- a/source/core/Multiply.cuh
+++ b/source/core/Multiply.cuh
@@ -29,11 +29,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA

 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);

 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);

 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i)  */
@@ -41,8 +41,7 @@ template<int nonZeroAlpha>__global__
 void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);

 /* element-wise product of two tensors */
-extern "C"
-void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim = 0, DTYPE alpha = 0);
+void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);

 #endif // USE_CUDA


--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+*/
+
+#ifndef __MULTIPLY_H__
+#define __MULTIPLY_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* 
+element-wise product of two tensors:
+c(i) = a(i)*b(i) + \alpha * c(i) 
+where i is the index of the element
+*/
+void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
+
+/* 
+element-wise product of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+a(i) = a(i)*b(i) + \alpha * a(i) 
+where i is the index of the element 
+*/
+void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
+
+/* 
+element-wise product of two tensors (return a XTensor structure)
+make a new tensor to keep the result and return it
+c(i) = a(i)*b(i)
+where i is the index of the element 
+*/
+XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __MULTIPLY_H__
\ No newline at end of file
--- a/source/core/Negate.cpp
+++ b/source/core/Negate.cpp
@@ -19,29 +19,64 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
 #include "Negate.h"
 #include "Negate.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-    set every entry to its minus value
-    >> a - the tensor we are processing
+set every entry to its minus value
+>> a - input tensor we are processing
+>> b - output tensor we are processing
 */
-void Negate(XTensor * a)
+void _Negate(const XTensor * a, XTensor * b)
 {
 #ifdef USE_CUDA
    /* run it on GPUs */
    if (a->devID >= 0) {
-        CudaNegate(a);
+        _CudaNegate(a, b);
    return;
-}
+    }
 #endif

+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
    DTYPE * d = (DTYPE*)a->data;
+    DTYPE * db = (DTYPE*)b->data;
    for (int i = 0; i < a->unitNum; i++)
-        d[i] = -d[i];
+        db[i] = -d[i];
+}
+
+/*
+set every entry to its minus value (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor we are processing
+*/
+void _NegateMe(XTensor * a)
+{
+    _Negate(a, a);
 }
+
+/*
+set every entry to its minus value (return a XTensor structure)
+make a new tensor to keep the result and return it
+>> a - input tensor we are processing
+<< return - the minus value of input tensor
+*/
+XTensor Negate(const XTensor & a)
+{
+    XTensor b(&a);
+    b.SetTMP();
+    
+    /* call _Negate function */
+    _Negate(&a, &b);
+    
+    /* tensor connections */
+    XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
+    
+    return b;
+}
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/core/Negate.cu
+++ b/source/core/Negate.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
 #include "Negate.h"
 #include "Negate.cuh"

@@ -29,45 +29,48 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /*
 set each entry to its negtive value (CUDA Kernel)
->> d - pointer to the data array
+>> a - pointer to the input data array
+>> b - pointer to the output data array
 >> size - size of the data array
 */
 __global__
-void KernelNegate(DTYPE * d, int size)
+void KernelNegate(DTYPE * a, DTYPE * b, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size)
-        d[i] = -d[i];
+        b[i] = -a[i];
 }

 /*
-    set each entry to its negtive value (CUDA Kernel)
-    This is for float16 computation
-    >> d - pointer to the data array
-    >> size - size of the data array
+set each entry to its negtive value (CUDA Kernel)
+This is for float16 computation
+>> a - pointer to the input data array
+>> b - pointer to the output data array
+>> size - size of the data array
 */
 __global__
-void KernelNegate(__half * d, int size)
+void KernelNegate(__half * a, __half * b, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
        if (i < size)
-            d[i] = __hsub(__float2half(0), d[i]);
+            b[i] = __hsub(__float2half(0), a[i]);
 #else
        if (i < size)
-            d[i] = __float2half(-__half2float(d[i]));
+            b[i] = __float2half(-__half2float(a[i]));
 #endif
 }

 /*
 set each entry to its negtive value
->> a - the tensor
+>> a - input tensor
+>> b - output tensor
 */
-extern "C"
-void CudaNegate(XTensor * a)
+void _CudaNegate(const XTensor * a, XTensor * b)
 {
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
    CheckNTErrors((a->isSparse == false), "TODO!");

    int gridSize[3];
@@ -82,10 +85,10 @@ void CudaNegate(XTensor * a)
    ProtectCudaDev(a->devID, devIDBackup);

    if (a->dataType == DEFAULT_DTYPE) {
-        KernelNegate << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
+        KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
    }
    else if (a->dataType == X_FLOAT16) {
-        KernelNegate << <blocks, threads >> >((__half*)a->data, a->unitNum);
+        KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
    }
    else {
        ShowNTErrors("TODO!");

--- a/source/core/Negate.cuh
+++ b/source/core/Negate.cuh
@@ -19,6 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

+#ifndef __NEGATE_CUH__
+#define __NEGATE_CUH__
+
 #include "Negate.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -27,16 +30,17 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /* set each entry to its negtive value (CUDA Kernel) */
 __global__
-void KernelNegate(DTYPE * d, int size);
+void KernelNegate(DTYPE * a, DTYPE * b, int size);

 /* set each entry to its negtive value (CUDA Kernel) with float16 data type*/
 __global__
-void KernelNegate(__half * d, int size);
+void KernelNegate(__half * a, __half * b, int size);

 /* set each entry to its negtive value */
-extern "C"
-void CudaNegate(XTensor * a);
+void _CudaNegate(const XTensor * a, XTensor * b);

 #endif // USE_CUDA

-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __NEGATE_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+*/
+
+#ifndef __NEGATE_H__
+#define __NEGATE_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* set every entry to its minus value */
+void _Negate(const XTensor * a, XTensor * b);
+
+/* 
+set every entry to its minus value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _NegateMe(XTensor * a);
+
+/* 
+set every entry to its minus value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Negate(const XTensor & a);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __NEGATE_H__
--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XTensor.h"
+#include "../../XName.h"
+#include "Sign.h"
+#include "Sign.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+set every entry to its sign value
+b(i) = 1 if a(i) > 0, b(i) = 0 if a(i) == 0, and b(i) = -1 otherwise
+>> a - input tensor we are processing
+>> b - output tensor we are processing
+*/
+void _Sign(const XTensor * a, XTensor * b)
+{
+#ifdef USE_CUDA
+    /* run it on GPUs (the decision is made on the device of a only) */
+    if (a->devID >= 0) {
+        _CudaSign(a, b);
+    return;
+} /* closes the if block above, not the function */
+#endif
+
+    /* CPU path: a and b must pass IsSameShaped and a must hold DEFAULT_DTYPE data */
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+    DTYPE * d = (DTYPE*)a->data;
+    DTYPE * db = (DTYPE*)b->data;
+    for (int i = 0; i < a->unitNum; i++) {
+        if (d[i] > 0)
+            db[i] = 1.0F;
+        else if (d[i] == 0)
+            db[i] = 0.0F;
+        else
+            /* everything that is neither > 0 nor == 0 ends up here */
+            db[i] = -1.0F;
+    }
+}
+
+/*
+set every entry to its sign value (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor we are processing (used as both input and output)
+*/
+void _SignMe(XTensor * a)
+{
+    /* a is passed to _Sign as the input and as the output */
+    _Sign(a, a);
+}
+
+/*
+set every entry to its sign value (return a XTensor structure)
+make a new tensor to keep the result and return it
+>> a - input tensor we are processing
+<< return - the sign value of the input tensor
+*/
+XTensor Sign(const XTensor & a)
+{
+    /* the result tensor b is constructed from a */
+    XTensor b(&a);
+    b.SetTMP();
+
+    /* call _Sign function */
+    _Sign(&a, &b);
+
+    /* tensor connections (a is the only input, hence the NULL) */
+    XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
+
+    return b;
+}
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.cu
+++ b/source/tensor/core/arithmetic/Sign.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Sign.h"
+#include "Sign.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+/*
+set each entry to its sign value (CUDA Kernel)
+b(i) = 1 if a(i) > 0, b(i) = 0 if a(i) == 0, and b(i) = -1 otherwise
+>> a - pointer to input data array
+>> b - pointer to output data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(DTYPE * a, DTYPE * b, int size)
+{
+    /* each thread handles at most one entry */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* threads whose index falls outside the array do nothing */
+    if (i < size) {
+        if (a[i] > 0)
+            b[i] = 1.0F;
+        else if (a[i] == 0)
+            b[i] = 0.0F;
+        else
+            b[i] = -1.0F;
+    }
+}
+
+/*
+set each entry to its sign value with float16 data type value (CUDA Kernel)
+This is for float16 computation
+b(i) = 1 if a(i) > 0, b(i) = 0 if a(i) == 0, and b(i) = -1 otherwise
+>> a - pointer to input data array
+>> b - pointer to output data array
+>> size - size of the data array
+*/
+__global__
+void KernelSign(__half * a, __half * b, int size)
+{
+    /* each thread handles at most one entry */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        /* the comparison is done in float so that no half-precision
+           arithmetic support is required from the device */
+        float v = __half2float(a[i]);
+        if (v > 0)
+            b[i] = __float2half(1.0F);
+        else if (v == 0)
+            b[i] = __float2half(0.0F);
+        else
+            b[i] = __float2half(-1.0F);
+    }
+}
+
+/*
+set each entry to its sign value
+>> a - input tensor we are processing
+>> b - output tensor we are processing
+*/
+void _CudaSign(const XTensor * a, XTensor * b)
+{
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((a->isSparse == false), "TODO!");
+
+    int gridSize[3];
+    int blockSize[3];
+
+    /* ask for a launch configuration for a->unitNum items on the device of a */
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+
+    /* only the first component of each size is used for the launch */
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+
+    /* NOTE(review): presumably switches to the device of a and remembers the
+       previous one in devIDBackup - confirm against the macro definition */
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    /* pick the kernel overload that matches the data type of a */
+    if (a->dataType == DEFAULT_DTYPE) {
+        KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
+    }
+    else if (a->dataType == X_FLOAT16) {
+        KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sign.cuh
+++ b/source/tensor/core/arithmetic/Sign.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __SIGN_CUH__
+#define __SIGN_CUH__
+
+#include "Sign.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* set each entry to its sign value (CUDA Kernel) */
+__global__
+void KernelSign(DTYPE * a, DTYPE * b, int size);
+
+/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelSign(__half * a, __half * b, int size);
+
+/* set each entry to its sign value */
+void _CudaSign(const XTensor * a, XTensor * b);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __SIGN_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+*/
+
+#ifndef __SIGN_H__
+#define __SIGN_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* set every entry to its sign value */
+void _Sign(const XTensor * a, XTensor * b);
+
+/* 
+set every entry to its sign value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _SignMe(XTensor * a);
+
+/* 
+set every entry to its sign value  (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Sign(const XTensor & a);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __SIGN_H__
--- a/source/core/Sum.cpp
+++ b/source/core/Sum.cpp
@@ -19,8 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
+#include "../../XUtility.h"
 #include "Sum.h"
 #include "Sum.cuh"

@@ -28,26 +29,20 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
 tensor summation c = a + b * \beta
+
 >> a - a tensor
 >> b - another tensor
->> c - where we put a+b*\beta. we save it in a if c is NULL
+>> c - where we put a+b*\beta (must not be NULL; use _SumMe to save the result in a)
 >> beta - the scaling factor
 */
-void Sum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
+void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
-    if (c == NULL)
-        c = a;
-
-    CheckNTErrors(a && b && c, "Empty tensors in addition!");
+    CheckNTErrors(a && b && c, "Empty tensor input!");
    CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
                  "Unmatched tensors in addition!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Unmatched tensors in addition!");

-    /* make tensor connections */
-    XLink::MakeLink(a, b, c, MATH_SUM);
-    XLink::AddParamToHead(c, beta);
-
    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {

 #ifdef USE_CUDA
@@ -63,17 +58,16 @@ void Sum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
                ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
            }
            else
-                CudaSum(a, b, c, beta);
+                _CudaSum(a, b, c, beta);
        }
        else
-            CudaSum(a, b, c, beta);
+            _CudaSum(a, b, c, beta);

 #endif
    }
    else {
        if (!a->isSparse && !b->isSparse) {
-            CheckNTErrors(!c->isSparse,
-                "Illegal use of sparse matrix in addition!");
+            CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");
    
            if (a->dataType == DEFAULT_DTYPE &&
                b->dataType == DEFAULT_DTYPE &&
@@ -116,5 +110,42 @@ void Sum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
        }
    }
 }
+    
+/*
+tensor summation a = a + b * \beta (performed in place)
+keep the result in the tensor a and return nothing
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+*/
+void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
+{
+    _Sum(a, b, a, beta);
+}
+    
+/*
+tensor summation c = a + b * \beta (return an XTensor structure)
+make a new tensor c to keep the result and return it
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+<< return - the result of tensor summation
+*/
+XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
+{
+    XTensor c(&a);
+    c.SetTMP();
+    
+    /* call _Sum function */
+    _Sum(&a, &b, &c, beta);
+    
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_SUM);
+    XLink::AddParamToHead(&c, beta);
+    
+    return c;
+}

-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+} // namespace nts(NiuTrans.Tensor)
--- a/source/core/Sum.cu
+++ b/source/core/Sum.cu
--- a/source/core/Sum.cuh
+++ b/source/core/Sum.cuh
--- a/source/tensor/core/arithmetic/Sum.h
+++ b/source/tensor/core/arithmetic/Sum.h
--- a/source/core/SumByColumnTV.cpp
+++ b/source/core/SumByColumnTV.cpp
--- a/source/core/SumByColumnTV.cu
+++ b/source/core/SumByColumnTV.cu
--- a/source/core/SumByColumnTV.cuh
+++ b/source/core/SumByColumnTV.cuh
--- a/source/core/SumByColumnTV.h
+++ b/source/core/SumByColumnTV.h
--- a/source/core/SumByColumnVT.cpp
+++ b/source/core/SumByColumnVT.cpp
--- a/source/core/SumByColumnVT.cu
+++ b/source/core/SumByColumnVT.cu
--- a/source/core/SumByColumnVT.cuh
+++ b/source/core/SumByColumnVT.cuh
--- a/source/core/SumByColumnVT.h
+++ b/source/core/SumByColumnVT.h
--- a/source/core/XTensorBLAS.cpp
+++ b/source/core/XTensorBLAS.cpp
--- a/source/core/XTensorBLAS.cu
+++ b/source/core/XTensorBLAS.cu
--- a/source/core/XTensorBLAS.h
+++ b/source/core/XTensorBLAS.h
--- a/source/tensor/core/getandset/ConvertDataType.cpp
+++ b/source/tensor/core/getandset/ConvertDataType.cpp
--- a/source/core/ConvertDataType.cu
+++ b/source/core/ConvertDataType.cu
--- a/source/tensor/core/getandset/ConvertDataType.cuh
+++ b/source/tensor/core/getandset/ConvertDataType.cuh
--- a/source/core/Split.h
+++ b/source/core/Split.h
--- a/source/core/Select.cpp
+++ b/source/core/Select.cpp
--- a/source/core/Select.cu
+++ b/source/core/Select.cu
--- a/source/core/Select.h
+++ b/source/core/Select.h
--- a/source/tensor/core/getandset/Select.h
+++ b/source/tensor/core/getandset/Select.h
--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
--- a/source/core/SetData.h
+++ b/source/core/SetData.h
--- a/source/tensor/core/math/Log.cpp
+++ b/source/tensor/core/math/Log.cpp
--- a/source/tensor/core/math/Log.cu
+++ b/source/tensor/core/math/Log.cu
--- a/source/tensor/core/math/Log.cuh
+++ b/source/tensor/core/math/Log.cuh
--- a/source/tensor/core/math/Log.h
+++ b/source/tensor/core/math/Log.h
--- a/source/core/Normalize.cpp
+++ b/source/core/Normalize.cpp
--- a/source/core/Normalize.cu
+++ b/source/core/Normalize.cu
--- a/source/core/Normalize.cuh
+++ b/source/core/Normalize.cuh
--- a/source/tensor/core/math/Normalize.h
+++ b/source/tensor/core/math/Normalize.h
--- a/source/core/Power.cpp
+++ b/source/core/Power.cpp
--- a/source/core/Power.cu
+++ b/source/core/Power.cu
--- a/source/core/Power.cuh
+++ b/source/core/Power.cuh
--- a/source/tensor/core/math/Power.h
+++ b/source/tensor/core/math/Power.h
--- a/source/core/ScaleAndShift.cpp
+++ b/source/core/ScaleAndShift.cpp
--- a/source/core/ScaleAndShift.cu
+++ b/source/core/ScaleAndShift.cu
--- a/source/core/ScaleAndShift.cuh
+++ b/source/core/ScaleAndShift.cuh
--- a/source/tensor/core/math/ScaleAndShift.h
+++ b/source/tensor/core/math/ScaleAndShift.h
--- a/source/core/CopyBlocks.cpp
+++ b/source/core/CopyBlocks.cpp
--- a/source/core/CopyBlocks.h
+++ b/source/core/CopyBlocks.h
--- a/source/core/CopyBlocksInGrid.cpp
+++ b/source/core/CopyBlocksInGrid.cpp
--- a/source/core/CopyBlocksInGrid.cu
+++ b/source/core/CopyBlocksInGrid.cu
--- a/source/core/CopyBlocksInGrid.cuh
+++ b/source/core/CopyBlocksInGrid.cuh
--- a/source/core/CopyBlocksInGrid.h
+++ b/source/core/CopyBlocksInGrid.h
--- a/source/core/CopyBlocksOnSite.cpp
+++ b/source/core/CopyBlocksOnSite.cpp
--- a/source/core/CopyBlocksOnSite.cu
+++ b/source/core/CopyBlocksOnSite.cu
--- a/source/core/CopyBlocksOnSite.cuh
+++ b/source/core/CopyBlocksOnSite.cuh
--- a/source/core/CopyBlocksOnSite.h
+++ b/source/core/CopyBlocksOnSite.h
--- a/source/core/CopyBlocksSelected.cu
+++ b/source/core/CopyBlocksSelected.cu
--- a/source/core/CopyBlocksSelected.cuh
+++ b/source/core/CopyBlocksSelected.cuh
--- a/source/core/CopyData2D.cpp
+++ b/source/core/CopyData2D.cpp
--- a/source/core/CopyData2D.h
+++ b/source/core/CopyData2D.h
--- a/source/core/CopyInGrid.cpp
+++ b/source/core/CopyInGrid.cpp
--- a/source/core/CopyInGrid.h
+++ b/source/core/CopyInGrid.h
--- a/source/core/CopyIndexed.cpp
+++ b/source/core/CopyIndexed.cpp
--- a/source/core/CopyIndexed.h
+++ b/source/core/CopyIndexed.h
--- a/source/core/CopyValues.cpp
+++ b/source/core/CopyValues.cpp
--- a/source/core/CopyValues.cu
+++ b/source/core/CopyValues.cu
--- a/source/core/CopyValues.cuh
+++ b/source/core/CopyValues.cuh
--- a/source/core/CopyValues.h
+++ b/source/core/CopyValues.h
--- a/source/core/ReduceMax.cpp
+++ b/source/core/ReduceMax.cpp
--- a/source/core/ReduceMax.cu
+++ b/source/core/ReduceMax.cu
--- a/source/core/ReduceMax.cuh
+++ b/source/core/ReduceMax.cuh
--- a/source/core/ReduceMax.h
+++ b/source/core/ReduceMax.h
--- a/source/core/ReduceMean.cpp
+++ b/source/core/ReduceMean.cpp
--- a/source/core/ReduceMean.h
+++ b/source/core/ReduceMean.h
--- a/source/core/ReduceStandardVariance.h
+++ b/source/core/ReduceStandardVariance.h
--- a/source/core/ReduceSum.cpp
+++ b/source/core/ReduceSum.cpp
--- a/source/core/ReduceSum.cu
+++ b/source/core/ReduceSum.cu
--- a/source/core/ReduceSum.cuh
+++ b/source/core/ReduceSum.cuh
--- a/source/core/ReduceSum.h
+++ b/source/core/ReduceSum.h
--- a/source/core/ReduceSumSquared.cpp
+++ b/source/core/ReduceSumSquared.cpp
--- a/source/core/ReduceSumSquared.h
+++ b/source/core/ReduceSumSquared.h
--- a/source/core/ReduceVariance.cpp
+++ b/source/core/ReduceVariance.cpp
--- a/source/core/ReduceVariance.h
+++ b/source/core/ReduceVariance.h
--- a/source/tensor/core/shape/Concatenate.cpp
+++ b/source/tensor/core/shape/Concatenate.cpp
--- a/source/core/Concatenate.h
+++ b/source/core/Concatenate.h
--- a/source/core/ConcatenateSolely.cpp
+++ b/source/core/ConcatenateSolely.cpp
--- a/source/core/ConcatenateSolely.h
+++ b/source/core/ConcatenateSolely.h
--- a/source/core/MakeMergeBlockIndex.cpp
+++ b/source/core/MakeMergeBlockIndex.cpp
--- a/source/core/MakeMergeBlockIndex.cu
+++ b/source/core/MakeMergeBlockIndex.cu
--- a/source/core/MakeMergeBlockIndex.cuh
+++ b/source/core/MakeMergeBlockIndex.cuh
--- a/source/core/MakeMergeBlockIndex.h
+++ b/source/core/MakeMergeBlockIndex.h
--- a/source/core/MakeSplitBlockIndex.cpp
+++ b/source/core/MakeSplitBlockIndex.cpp
--- a/source/core/MakeSplitBlockIndex.cu
+++ b/source/core/MakeSplitBlockIndex.cu
--- a/source/core/MakeSplitBlockIndex.cuh
+++ b/source/core/MakeSplitBlockIndex.cuh
--- a/source/core/MakeSplitBlockIndex.h
+++ b/source/core/MakeSplitBlockIndex.h
--- a/source/core/Merge.cpp
+++ b/source/core/Merge.cpp
--- a/source/core/Merge.h
+++ b/source/core/Merge.h
--- a/source/core/MergeBlockLists.cpp
+++ b/source/core/MergeBlockLists.cpp
--- a/source/core/MergeBlockLists.cu
+++ b/source/core/MergeBlockLists.cu
--- a/source/core/MergeBlockLists.cuh
+++ b/source/core/MergeBlockLists.cuh
--- a/source/core/MergeBlockLists.h
+++ b/source/core/MergeBlockLists.h
--- a/source/core/Permute.cpp
+++ b/source/core/Permute.cpp
--- a/source/core/Permute.h
+++ b/source/core/Permute.h
--- a/source/core/Split.cpp
+++ b/source/core/Split.cpp
--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
--- a/source/core/Transpose.cpp
+++ b/source/core/Transpose.cpp
--- a/source/core/Transpose.h
+++ b/source/core/Transpose.h
--- a/source/core/Unsqueeze.cpp
+++ b/source/core/Unsqueeze.cpp
--- a/source/core/Unsqueeze.cu
+++ b/source/core/Unsqueeze.cu
--- a/source/core/Unsqueeze.cuh
+++ b/source/core/Unsqueeze.cuh
--- a/source/core/Unsqueeze.h
+++ b/source/core/Unsqueeze.h
--- a/source/core/Sort.cpp
+++ b/source/core/Sort.cpp
--- a/source/core/Sort.cu
+++ b/source/core/Sort.cu
--- a/source/core/Sort.cuh
+++ b/source/core/Sort.cuh
--- a/source/core/Sort.h
+++ b/source/core/Sort.h
--- a/source/core/TopK.cpp
+++ b/source/core/TopK.cpp
--- a/source/core/TopK.cu
+++ b/source/core/TopK.cu
--- a/source/core/TopK.cuh
+++ b/source/core/TopK.cuh
--- a/source/core/TopK.h
+++ b/source/core/TopK.h
--- a/source/core/FlushToMem.cpp
+++ b/source/core/FlushToMem.cpp
--- a/source/core/FlushToMem.cu
+++ b/source/core/FlushToMem.cu
--- a/source/core/FlushToMem.cuh
+++ b/source/core/FlushToMem.cuh
--- a/source/core/FlushToMem.h
+++ b/source/core/FlushToMem.h
--- a/source/core/SetAscendingOrder.cu
+++ b/source/core/SetAscendingOrder.cu
--- a/source/core/SetAscendingOrder.cuh
+++ b/source/core/SetAscendingOrder.cuh
--- a/source/core/XMatrixSegment.cpp
+++ b/source/core/XMatrixSegment.cpp
--- a/source/core/XMatrixSegment.h
+++ b/source/core/XMatrixSegment.h
--- a/source/function/FHeader.h
+++ b/source/function/FHeader.h
--- a/source/function/HardTanH.cpp
+++ b/source/function/HardTanH.cpp
--- a/source/function/HardTanH.cu
+++ b/source/function/HardTanH.cu
--- a/source/function/HardTanH.cuh
+++ b/source/function/HardTanH.cuh
--- a/source/function/HardTanH.h
+++ b/source/function/HardTanH.h
--- a/source/function/Identity.cpp
+++ b/source/function/Identity.cpp
--- a/source/function/Identity.h
+++ b/source/function/Identity.h
--- a/source/function/LogSoftmax.cpp
+++ b/source/function/LogSoftmax.cpp
--- a/source/function/LogSoftmax.cu
+++ b/source/function/LogSoftmax.cu
--- a/source/function/LogSoftmax.cuh
+++ b/source/function/LogSoftmax.cuh
--- a/source/function/LogSoftmax.h
+++ b/source/function/LogSoftmax.h
--- a/source/function/Loss.cpp
+++ b/source/function/Loss.cpp
--- a/source/tensor/function/Loss.cu
+++ b/source/tensor/function/Loss.cu
--- a/source/function/Loss.cuh
+++ b/source/function/Loss.cuh
--- a/source/function/Loss.h
+++ b/source/function/Loss.h
--- a/source/function/Rectify.cpp
+++ b/source/function/Rectify.cpp
--- a/source/function/Rectify.cu
+++ b/source/function/Rectify.cu
--- a/source/function/Rectify.cuh
+++ b/source/function/Rectify.cuh
--- a/source/function/Rectify.h
+++ b/source/function/Rectify.h
--- a/source/function/Sigmoid.cpp
+++ b/source/function/Sigmoid.cpp
--- a/source/function/Sigmoid.cu
+++ b/source/function/Sigmoid.cu
--- a/source/function/Sigmoid.cuh
+++ b/source/function/Sigmoid.cuh
--- a/source/function/Sigmoid.h
+++ b/source/function/Sigmoid.h
--- a/source/function/Softmax.cpp
+++ b/source/function/Softmax.cpp
--- a/source/function/Softmax.cu
+++ b/source/function/Softmax.cu
--- a/source/function/Softmax.cuh
+++ b/source/function/Softmax.cuh
--- a/source/function/Softmax.h
+++ b/source/function/Softmax.h
--- a/source/tensor/test/TAbsolute.cpp
+++ b/source/tensor/test/TAbsolute.cpp
--- a/source/core/ScaleAndShift.h
+++ b/source/core/ScaleAndShift.h
--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
--- a/source/test/TConcatenate.h
+++ b/source/test/TConcatenate.h
--- a/source/test/TConcatenateSolely.cpp
+++ b/source/test/TConcatenateSolely.cpp
--- a/source/test/TConcatenateSolely.h
+++ b/source/test/TConcatenateSolely.h
--- a/source/tensor/test/TConvertDataType.cpp
+++ b/source/tensor/test/TConvertDataType.cpp
--- a/source/tensor/test/TConvertDataType.h
+++ b/source/tensor/test/TConvertDataType.h
--- a/source/tensor/test/TCopyIndexed.cpp
+++ b/source/tensor/test/TCopyIndexed.cpp
--- a/source/XName.h
+++ b/source/XName.h
--- a/source/tensor/test/TCopyValues.cpp
+++ b/source/tensor/test/TCopyValues.cpp
--- a/source/tensor/test/TCopyValues.h
+++ b/source/tensor/test/TCopyValues.h
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/test/THardTanH.h
+++ b/source/test/THardTanH.h
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/tensor/test/TIdentity.h
+++ b/source/tensor/test/TIdentity.h
--- a/source/tensor/test/TLog.cpp
+++ b/source/tensor/test/TLog.cpp
--- a/source/test/TSoftmax.h
+++ b/source/test/TSoftmax.h
--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
--- a/source/tensor/test/TLogSoftmax.h
+++ b/source/tensor/test/TLogSoftmax.h
--- a/source/tensor/test/TLoss.cpp
+++ b/source/tensor/test/TLoss.cpp
--- a/source/test/TLoss.h
+++ b/source/test/TLoss.h
--- a/source/tensor/test/TMatrixMULBatchedCPU.cpp
+++ b/source/tensor/test/TMatrixMULBatchedCPU.cpp
--- a/source/test/TMatrixMULBatchedCPU.h
+++ b/source/test/TMatrixMULBatchedCPU.h
--- a/source/test/TMatrixMul.cpp
+++ b/source/test/TMatrixMul.cpp
--- a/source/test/TMatrixMul.h
+++ b/source/test/TMatrixMul.h
--- a/source/tensor/test/TMatrixMul2D.cpp
+++ b/source/tensor/test/TMatrixMul2D.cpp
--- a/source/test/TMatrixMul2D.h
+++ b/source/test/TMatrixMul2D.h
--- a/source/tensor/test/TMatrixMul2DParallel.cpp
+++ b/source/tensor/test/TMatrixMul2DParallel.cpp
--- a/source/tensor/test/TMatrixMul2DParallel.h
+++ b/source/tensor/test/TMatrixMul2DParallel.h
--- a/source/tensor/test/TMatrixMulBatched.cpp
+++ b/source/tensor/test/TMatrixMulBatched.cpp
--- a/source/core/Sum.h
+++ b/source/core/Sum.h
--- a/source/tensor/test/TMerge.cpp
+++ b/source/tensor/test/TMerge.cpp
--- a/source/test/TMerge.h
+++ b/source/test/TMerge.h
--- a/source/test/TMultiply.cpp
+++ b/source/test/TMultiply.cpp
--- a/source/test/TMultiply.h
+++ b/source/test/TMultiply.h
--- a/source/tensor/test/TNegate.cpp
+++ b/source/tensor/test/TNegate.cpp
--- a/source/test/TNegate.h
+++ b/source/test/TNegate.h
--- a/source/tensor/test/TNormalize.cpp
+++ b/source/tensor/test/TNormalize.cpp
--- a/source/test/TNormalize.h
+++ b/source/test/TNormalize.h
--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
--- a/source/test/TPower.h
+++ b/source/test/TPower.h
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/test/TRectify.h
+++ b/source/test/TRectify.h
--- a/source/tensor/test/TReduceMax.cpp
+++ b/source/tensor/test/TReduceMax.cpp
--- a/source/test/TReduceMax.h
+++ b/source/test/TReduceMax.h
--- a/source/tensor/test/TReduceMean.cpp
+++ b/source/tensor/test/TReduceMean.cpp
--- a/source/test/TReduceMean.h
+++ b/source/test/TReduceMean.h
--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
--- a/source/test/TReduceSum.h
+++ b/source/test/TReduceSum.h
--- a/source/tensor/test/TReduceSumSquared.cpp
+++ b/source/tensor/test/TReduceSumSquared.cpp
--- a/source/tensor/test/TReduceSumSquared.h
+++ b/source/tensor/test/TReduceSumSquared.h
--- a/source/tensor/test/TReduceVariance.cpp
+++ b/source/tensor/test/TReduceVariance.cpp
--- a/source/tensor/test/TReduceVariance.h
+++ b/source/tensor/test/TReduceVariance.h
--- a/source/tensor/test/TScaleAndShift.cpp
+++ b/source/tensor/test/TScaleAndShift.cpp
--- a/source/core/Normalize.h
+++ b/source/core/Normalize.h
--- a/source/tensor/test/TSelect.cpp
+++ b/source/tensor/test/TSelect.cpp
--- a/source/tensor/test/TSelect.h
+++ b/source/tensor/test/TSelect.h
--- a/source/tensor/test/TSetAscendingOrder.cpp
+++ b/source/tensor/test/TSetAscendingOrder.cpp
--- a/source/core/Negate.h
+++ b/source/core/Negate.h
--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
--- a/source/core/Power.h
+++ b/source/core/Power.h
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/test/TSigmoid.h
+++ b/source/test/TSigmoid.h
--- a/source/tensor/test/TSign.cpp
+++ b/source/tensor/test/TSign.cpp
--- a/source/core/Multiply.h
+++ b/source/core/Multiply.h
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
--- a/source/tensor/test/TSoftmax.h
+++ b/source/tensor/test/TSoftmax.h
--- a/source/tensor/test/TSort.cpp
+++ b/source/tensor/test/TSort.cpp
--- a/source/tensor/test/TSort.h
+++ b/source/tensor/test/TSort.h
--- a/source/test/TSplit.cpp
+++ b/source/test/TSplit.cpp
--- a/source/tensor/test/TSplit.h
+++ b/source/tensor/test/TSplit.h
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/source/core/MatrixMul.h
+++ b/source/core/MatrixMul.h
--- a/source/tensor/test/TSumByColumnTV.cpp
+++ b/source/tensor/test/TSumByColumnTV.cpp
--- a/source/tensor/test/TSumByColumnTV.h
+++ b/source/tensor/test/TSumByColumnTV.h
--- a/source/tensor/test/TSumByColumnVT.cpp
+++ b/source/tensor/test/TSumByColumnVT.cpp
--- a/source/tensor/test/TSumByColumnVT.h
+++ b/source/tensor/test/TSumByColumnVT.h
--- a/source/tensor/test/TTopK.cpp
+++ b/source/tensor/test/TTopK.cpp
--- a/source/tensor/test/TTopK.h
+++ b/source/tensor/test/TTopK.h
--- a/source/tensor/test/TUnsqueeze.cpp
+++ b/source/tensor/test/TUnsqueeze.cpp
--- a/source/tensor/test/TUnsqueeze.h
+++ b/source/tensor/test/TUnsqueeze.h
--- a/source/tensor/test/TXMem.cpp
+++ b/source/tensor/test/TXMem.cpp
--- a/source/core/XTensorFunction.h
+++ b/source/core/XTensorFunction.h
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h
--- a/source/test/TConcatenate.cpp
+++ b/source/test/TConcatenate.cpp
--- a/source/test/THardTanH.cpp
+++ b/source/test/THardTanH.cpp
--- a/source/test/TLoss.cpp
+++ b/source/test/TLoss.cpp
--- a/source/test/TMatrixMULBatchedCPU.cpp
+++ b/source/test/TMatrixMULBatchedCPU.cpp
--- a/source/test/TMatrixMul2D.cpp
+++ b/source/test/TMatrixMul2D.cpp
--- a/source/test/TMerge.cpp
+++ b/source/test/TMerge.cpp
--- a/source/test/TNegate.cpp
+++ b/source/test/TNegate.cpp
--- a/source/test/TNormalize.cpp
+++ b/source/test/TNormalize.cpp
--- a/source/test/TPower.cpp
+++ b/source/test/TPower.cpp
--- a/source/test/TRectify.cpp
+++ b/source/test/TRectify.cpp
--- a/source/test/TReduceMax.cpp
+++ b/source/test/TReduceMax.cpp
--- a/source/test/TReduceMean.cpp
+++ b/source/test/TReduceMean.cpp
--- a/source/test/TReduceSum.cpp
+++ b/source/test/TReduceSum.cpp
--- a/source/test/TSigmoid.cpp
+++ b/source/test/TSigmoid.cpp
--- a/source/test/TSoftmax.cpp
+++ b/source/test/TSoftmax.cpp
--- a/source/test/TSort.cpp
+++ b/source/test/TSort.cpp
--- a/source/test/TSort.h
+++ b/source/test/TSort.h
--- a/source/test/TSplit.h
+++ b/source/test/TSplit.h
--- a/source/test/TSum.cpp
+++ b/source/test/TSum.cpp
--- a/source/test/TSum.h
+++ b/source/test/TSum.h
--- a/source/test/TXMem.cpp
+++ b/source/test/TXMem.cpp
--- a/source/test/TXMem.h
+++ b/source/test/TXMem.h
--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp
--- a/source/test/Test.h
+++ b/source/test/Test.h