refactor parameter from pointer to reference

771643c6 · huchi · 04f129fc · 771643c6 · 771643c6 · 771643c6
Commit 771643c6 authored Jul 19, 2019 by huchi
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -55,7 +55,7 @@ int main( int argc, const char ** argv )
    //    fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
    //    fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
    //}
-	BackwardTest();
+    BackwardTest();

    //_CrtDumpMemoryLeaks();
    
@@ -69,9 +69,9 @@ void BackwardTest()
    XTensor a;
    XTensor b;
    XTensor c;
-	a.enableGrad = true;
-	b.enableGrad = false;
-	c.enableGrad = false;
+    a.enableGrad = true;
+    b.enableGrad = false;
+    c.enableGrad = false;
    XTensor mean;
    XTensor origin;
    InitTensor2D(&a, 2, 3);
@@ -89,9 +89,9 @@ void BackwardTest()
    b.Set1D(2.0F, 0);
    b.Set1D(1.0F, 1);

-	DivDim(a, b, c, 0);
+    DivDim(a, b, c, 0);
    c.Dump(stderr, "c:");
-	auto loss = CrossEntropy(c, a);
+    auto loss = CrossEntropy(c, a);

    //XLink::ShowNetwork(stderr, &c);


--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -765,15 +765,15 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)

    /* dE/da */
    _MultiplyDim(node->grad, b, a->grad, n, 1.0F);
-	
-	/* dE/db */
+    
+    /* dE/db */
    int order = a->order;
    int dimSize[MAX_TENSOR_DIM_NUM];
    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);

    XTensor * bGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
    _Multiply(node->grad, a, bGradTMP);
-	
+    
    if(n == order - 1){
        int reshapedSize[MAX_TENSOR_DIM_NUM];
        reshapedSize[0] = a->unitNum/dimSize[order - 1];
@@ -1078,91 +1078,91 @@ dE/db = - dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
 */
 void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
 {
-	XLink &income = node->income;
-	CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
-
-	XTensor * a = income.tails[0];
-	XTensor * b = income.tails[1];
-	int n = income.GetParamInt(0);
-	DTYPE beta = income.GetParam(1);
-	XNoder::MakeGrad(a);
-	XNoder::MakeGrad(b);
-
-	_Sum(a->grad, node->grad, a->grad);
-
-	int order = a->order;
-	int dimSize[MAX_TENSOR_DIM_NUM];
-	memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
-
-	if(n == order - 1){
-		int reshapedSize[MAX_TENSOR_DIM_NUM];
-		reshapedSize[0] = a->unitNum / dimSize[order - 1];
-		reshapedSize[1] = dimSize[order - 1];
-
-		/* we reshape dE/dc to a matrix whose column number is equal to the
-		   size of b. Then we can reduce the matrix into a row vector. */
-		node->grad->Reshape(2, reshapedSize);
-
-		//if(b->outgo.tailNum > 1){
-			XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
-			_ReduceSum(node->grad, bGradTMP, 0);
-			if(beta != 1.0F)
-				_ScaleAndShiftMe(bGradTMP, beta);
-			_Sub(b->grad, bGradTMP, b->grad);
-			DelTensorBuf(bGradTMP);
-		/*}
-		else{
-			_ReduceSum(node->grad, b->grad, 0);
-			if(beta != 1.0F)
-				_ScaleAndShiftMe(b->grad, beta);
-			_ScaleAndShiftMe(b->grad, -1.0F);
-		}*/
-
-		node->grad->Reshape(order, dimSize);
-	}
-	else{
-		int reshapedSize[MAX_TENSOR_DIM_NUM];
-		reshapedSize[0] = 1;
-		reshapedSize[1] = dimSize[n];
-		reshapedSize[2] = 1;
-
-		for(int i = 0; i < order; i++){
-			if(i < n)
-				reshapedSize[0] *= dimSize[i];
-		}
-
-		reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
-
-		/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
-		   Then reduce along with z and x to obtain dE/db. */
-		node->grad->Reshape(3, reshapedSize);
-
-		XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
-
-		_ReduceSum(node->grad, interGrad, 2);
-
-		//if(b->outgo.tailNum > 1){
-			XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
-			_ReduceSum(interGrad, bGradTMP, 0);
-			if(beta != 1.0F)
-				_ScaleAndShiftMe(bGradTMP, beta);
-			_Sub(b->grad, bGradTMP, b->grad);
-			DelTensorBuf(bGradTMP);
-		/*}
-		else{
-			_ReduceSum(interGrad, b->grad, 0);
-			if(beta != 1.0F)
-				_ScaleAndShiftMe(b->grad, beta);
-			_ScaleAndShiftMe(b->grad, -1.0F);
-		}*/
-
-		node->grad->Reshape(order, dimSize);
-
-		DelTensorBuf(interGrad);
-
-	}
-
-	node->visitMark = NODE_FINISHED;
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    int n = income.GetParamInt(0);
+    DTYPE beta = income.GetParam(1);
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    _Sum(a->grad, node->grad, a->grad);
+
+    int order = a->order;
+    int dimSize[MAX_TENSOR_DIM_NUM];
+    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
+
+    if(n == order - 1){
+        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        reshapedSize[0] = a->unitNum / dimSize[order - 1];
+        reshapedSize[1] = dimSize[order - 1];
+
+        /* we reshape dE/dc to a matrix whose column number is equal to the
+           size of b. Then we can reduce the matrix into a row vector. */
+        node->grad->Reshape(2, reshapedSize);
+
+        //if(b->outgo.tailNum > 1){
+            XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+            _ReduceSum(node->grad, bGradTMP, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(bGradTMP, beta);
+            _Sub(b->grad, bGradTMP, b->grad);
+            DelTensorBuf(bGradTMP);
+        /*}
+        else{
+            _ReduceSum(node->grad, b->grad, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(b->grad, beta);
+            _ScaleAndShiftMe(b->grad, -1.0F);
+        }*/
+
+        node->grad->Reshape(order, dimSize);
+    }
+    else{
+        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        reshapedSize[0] = 1;
+        reshapedSize[1] = dimSize[n];
+        reshapedSize[2] = 1;
+
+        for(int i = 0; i < order; i++){
+            if(i < n)
+                reshapedSize[0] *= dimSize[i];
+        }
+
+        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
+
+        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+           Then reduce along with z and x to obtain dE/db. */
+        node->grad->Reshape(3, reshapedSize);
+
+        XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+
+        _ReduceSum(node->grad, interGrad, 2);
+
+        //if(b->outgo.tailNum > 1){
+            XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+            _ReduceSum(interGrad, bGradTMP, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(bGradTMP, beta);
+            _Sub(b->grad, bGradTMP, b->grad);
+            DelTensorBuf(bGradTMP);
+        /*}
+        else{
+            _ReduceSum(interGrad, b->grad, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(b->grad, beta);
+            _ScaleAndShiftMe(b->grad, -1.0F);
+        }*/
+
+        node->grad->Reshape(order, dimSize);
+
+        DelTensorBuf(interGrad);
+
+    }
+
+    node->visitMark = NODE_FINISHED;
 }

 /* 

--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -146,10 +146,10 @@ private:
    static
    void GradSub(XTensor * node, bool isEfficient);
    
-	/* gradient for sub with one dimension: c = a - b * \beta
-	where the size of b is equal to that of one dimension of a */
-	static
-	void GradSubDim(XTensor * node, bool isEfficient);
+    /* gradient for sub with one dimension: c = a - b * \beta
+    where the size of b is equal to that of one dimension of a */
+    static
+    void GradSubDim(XTensor * node, bool isEfficient);

    /* gradient for sum: c =  a + b * \beta */
    static

--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -450,7 +450,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
        if(income.typeID == SHAPE_SPLIT_LIST){
            int w = income.GetParamInt(0);
            int splitID = income.GetParamInt(1);
-			
+            
            if(whereToSplit < 0)
                whereToSplit = w;
            splitNum++;

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -267,7 +267,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
        else if(XShapeGrad::IsShapeOP(node))
            XShapeGrad::MakeGrad(node, isEfficent);
        else if(XLossGrad::IsLossOP(node))
-			XLossGrad::MakeGrad(node, isEfficent);
+            XLossGrad::MakeGrad(node, isEfficent);
        else{
            ShowNTErrors("Wrong node type!");
        }
@@ -468,7 +468,7 @@ search for a node in a top-down manner by its name
 */
 //XTensor * XNet::SearchNode(XTensor * top, const char * name)
 //{
-	//return XLink::SearchNode(top, name);
+    //return XLink::SearchNode(top, name);
 //}

 }
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -482,12 +482,12 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                Clear(model, true);

                /* forward + backward process */
-				
-				/* this is implemented by gather function */
+                
+                /* this is implemented by gather function */
                ForwardAutoDiff(ngrams, ngramNum, output, model);
-				
-				/* this is implemented by multiply function */
-				//ForwardAutoDiff(inputs, output, model);
+                
+                /* this is implemented by multiply function */
+                //ForwardAutoDiff(inputs, output, model);
                lossTensor = CrossEntropy(output, gold);

                /* automatic differentiation */
@@ -1177,12 +1177,12 @@ void Test(const char * test, const char * result, FNNModel &model)
            /* forward computation */
            Forward(inputs, output, model, net);
        }
-        else {			
-			/* this is implemented by gather function */
+        else {            
+            /* this is implemented by gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);
-				
-			/* this is implemented by multiply function */
-			//ForwardAutoDiff(inputs, output, model);
+                
+            /* this is implemented by multiply function */
+            //ForwardAutoDiff(inputs, output, model);
        }

        /* prediction probabilities */

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -61,7 +61,7 @@ public:
    XTensor wa;
    
    XTensor wbig;
-	
+    
    /* size of transformed Q and K */
    int dk;


--- a/source/sample/transformer/T2TBatchLoader.cpp
+++ b/source/sample/transformer/T2TBatchLoader.cpp
@@ -86,7 +86,7 @@ struct SampleNode
    int * p;
    int size;
    int value;
-	int key;
+    int key;
 };

 int CompareSampleNode(const void * a, const void * b)
@@ -289,7 +289,7 @@ int T2TBatchLoader::LoadBatch(FILE * file, bool isLM,
                          int vsEnc, int vsDec, int sBatch, int wBatch, 
                          bool isSorted, int &ws, int &wCount,
                          int devID, XMem * mem, 
-						  bool isTraining)
+                          bool isTraining)
 {
    if(isLM){
        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
@@ -331,7 +331,7 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
                            int vSize, int sBatch, int wBatch, 
                            bool isSorted, int &wCount,
                            int devID, XMem * mem,
-							bool isTraining)
+                            bool isTraining)
 {
    if(nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 1);
@@ -490,7 +490,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
                            int vSizeEnc, int vSizeDec, int sBatch, int wBatch, 
                            bool isSorted, int &ws, int &wCount,
                            int devID, XMem * mem, 
-							bool isTraining)
+                            bool isTraining)
 {
    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);

--- a/source/sample/transformer/T2TBatchLoader.h
+++ b/source/sample/transformer/T2TBatchLoader.h
@@ -132,7 +132,7 @@ public:
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
                  bool isSorted, int &ws, int &wCount,
                  int devID, XMem * mem, 
-				  bool isTraining);
+                  bool isTraining);

    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE * file, 
@@ -142,7 +142,7 @@ public:
                    int * seqs, int vs, int sBatch, int wBatch, 
                    bool isSorted, int &wCount,
                    int devID, XMem * mem, 
-					bool isTraining);
+                    bool isTraining);

    /* load a batch of sequences (for machine translation) */
    int LoadBatchMT(FILE * file, 
@@ -152,7 +152,7 @@ public:
                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
                    bool isSorted, int &ws, int &wCount,
                    int devID, XMem * mem, 
-					bool isTraining);
+                    bool isTraining);

    /* shuffle the data file */
    void Shuffle(const char * srcFile, const char * tgtFile);

--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
@@ -303,7 +303,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    /* Then, we do something similar to "preID". For the top-k predictions, we need 
       to know their indices in the vocabulary. We compute the offset of each prediction
       in the vocabulary by dividing it with vocab-size and computing the remainder. */
-    _ModMe(index, sizeVocab);
+    ModMe(index, sizeVocab);

    score.Reshape(order, dims);


--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
@@ -528,7 +528,7 @@ get device ids for the given device information
 */
 int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 {
-	StrList* terms = new StrList(1);
+    StrList* terms = new StrList(1);
    SplitALine(devInfo, " ", terms);

    for(int i = 0; i < terms->count; i++){

--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
@@ -90,7 +90,7 @@ template <typename T>
 void TensorListBase<T>::Add(T&& item)
 {
    if (count == maxNum) {
-		
+        
        T* newItems;
        if (mem == NULL)
            newItems = new T[maxNum * 2 + 1];
@@ -101,7 +101,7 @@ void TensorListBase<T>::Add(T&& item)
        maxNum = maxNum * 2 + 1;
    }
    items[count++] = item;
-	
+    
 }

 /*
@@ -111,18 +111,18 @@ add an item into the list
 template <typename T>
 void TensorListBase<T>::Add(const T& item)
 {
-	if (count == maxNum) {
-		T* newItems;
-		if (mem == NULL)
-			newItems = new T[maxNum * 2 + 1];
-		else
-			newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
-		memcpy(newItems, items, sizeof(T) * maxNum);
-		items = newItems;
-		maxNum = maxNum * 2 + 1;
-	}
-
-	items[count++] = item;
+    if (count == maxNum) {
+        T* newItems;
+        if (mem == NULL)
+            newItems = new T[maxNum * 2 + 1];
+        else
+            newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
+        memcpy(newItems, items, sizeof(T) * maxNum);
+        items = newItems;
+        maxNum = maxNum * 2 + 1;
+    }
+
+    items[count++] = item;
 }

 /* 
@@ -186,21 +186,21 @@ void TensorListBase<T>::Insert(int pos, const T& item)
 template<typename T>
 void TensorListBase<T>::Insert(int pos, T&& item)
 {
-	if (count == maxNum) {
-		T* newItems;
-		if (mem == NULL)
-			newItems = new T[maxNum * 2 + 1];
-		else
-			newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
-		memcpy(newItems, items, sizeof(T) * maxNum);
-		items = newItems;
-		maxNum = maxNum * 2 + 1;
-	}
-
-	for (int i = count - 1; i >= pos; i--)
-		items[i + 1] = items[i];
-	items[pos] = item;
-	count++;
+    if (count == maxNum) {
+        T* newItems;
+        if (mem == NULL)
+            newItems = new T[maxNum * 2 + 1];
+        else
+            newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
+        memcpy(newItems, items, sizeof(T) * maxNum);
+        items = newItems;
+        maxNum = maxNum * 2 + 1;
+    }
+
+    for (int i = count - 1; i >= pos; i--)
+        items[i + 1] = items[i];
+    items[pos] = item;
+    count++;
 }

 /* get the item at position i */
@@ -226,8 +226,8 @@ inline void TensorListBase<T>::SetItem(int i, const T& item)
 template<typename T>
 inline void TensorListBase<T>::SetItem(int i, T&& item)
 {
-	if (i >= 0 && i < count)
-		items[i] = std::move(item);
+    if (i >= 0 && i < count)
+        items[i] = std::move(item);
 }

 /* 
@@ -250,7 +250,7 @@ inline int TensorListBase<T>::FindFirst(const T& item)
 template <typename T>
 void TensorListBase<T>::Clear()
 {
-	count = 0;
+    count = 0;
 }

 /*

--- a/source/tensor/XList.h
+++ b/source/tensor/XList.h
@@ -32,7 +32,7 @@

 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {
-	
+    
 /* the TensorListBase class */
 template <typename T>
 struct TensorListBase {
@@ -66,57 +66,57 @@ public:
    /* add an item into the list */
    void Add(T&& item);

-	/* add an item into the list */
-	void Add(const T& item);
+    /* add an item into the list */
+    void Add(const T& item);

-	/* add a number of items into the list */
+    /* add a number of items into the list */
    void Add(T* inputItems, int inputItemCount);

-	/* append a list to the current list */
+    /* append a list to the current list */
    void AddList(TensorListBase* l);

-	/* insert an item to the given position of the list */
+    /* insert an item to the given position of the list */
    void Insert(int pos, const T& item);

-	/* insert an item to the given position of the list */
-	void Insert(int pos, T&& item);
+    /* insert an item to the given position of the list */
+    void Insert(int pos, T&& item);

-	/* get the item at position i */
+    /* get the item at position i */
    T& GetItem(int i) const;

-	/* set the item at position i */
+    /* set the item at position i */
    void SetItem(int i, const T& item);

-	/* set the item at position i */
-	void SetItem(int i, T&& item);
+    /* set the item at position i */
+    void SetItem(int i, T&& item);

-	/* find the position of the first matched item  */
+    /* find the position of the first matched item  */
    int FindFirst(const T& item);

-	/* clear the data array */
+    /* clear the data array */
    void Clear();

-	/* sort the list */
+    /* sort the list */
    void Sort(int itemSize);

-	/* reverse the list */
+    /* reverse the list */
    void Reverse();

-	/* remove the item at position i */
+    /* remove the item at position i */
    void Remove(int i);

-	/* copy the list */
+    /* copy the list */
    TensorListBase* Copy(XMem* myMem);

-	/* shuffle the list */
+    /* shuffle the list */
    void Shuffle(int nround = 10, int beg = -1, int len = 0);

    /* short */
-	T& operator[] (int i) {
-		return GetItem(i);
-	};
+    T& operator[] (int i) {
+        return GetItem(i);
+    };
    T& Get(int i) { return GetItem(i); };
-	void Set(int i, T item) { SetItem(i, item); };
+    void Set(int i, T item) { SetItem(i, item); };
 };

 struct XTensor;

--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -305,7 +305,7 @@ void XMem::SetComputationMode(bool myIsForComputation)
        cublasDestroy(cublasHandle);
    if(myIsForComputation)
        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
-				      "Cannot create the cublas handle.");
+                      "Cannot create the cublas handle.");

    SetDevice(devIDBackup);
 #endif
@@ -321,11 +321,11 @@ void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
 {
    delete[] memIndex;
    delete[] memIndex2;
-	delete[] minSizeIndex;
+    delete[] minSizeIndex;

-	nodeNum = indexSize;
-	nodeNumUsed = minSizeNum * 2;
-	indexEntryNum = minSizeNum;
+    nodeNum = indexSize;
+    nodeNumUsed = minSizeNum * 2;
+    indexEntryNum = minSizeNum;
    
    memIndex = new MPieceNode[nodeNum];
    memset(memIndex, 0, sizeof(MPieceNode) * nodeNum);
@@ -333,12 +333,12 @@ void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
    memIndex2 = new MPieceNode[nodeNum];
    memset(memIndex2, 0, sizeof(MPieceNode) * nodeNum);

-	minSizeIndex = new MTYPE[indexEntryNum];
-	memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);
+    minSizeIndex = new MTYPE[indexEntryNum];
+    memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);

-	minSizeIndex[0] = minSizeFirst;
-	for(int i = 1; i < indexEntryNum; i++)
-		minSizeIndex[i] = minSizeIndex[i - 1] * 2;
+    minSizeIndex[0] = minSizeFirst;
+    for(int i = 1; i < indexEntryNum; i++)
+        minSizeIndex[i] = minSizeIndex[i - 1] * 2;

    indexOffset = GetMSB(minSizeFirst);
 }
@@ -757,8 +757,8 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)

    /* if all index nodes are used, we rebuild the index to release the nodes that are free */
    if(nodeNumUsed == nodeNum){
-    	RebuildIndex();
-    	CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
+        RebuildIndex();
+        CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
    }

    /*if(testxmemid == 30){
@@ -961,8 +961,8 @@ release a piece of memory as "free"
 */
 void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
 {
-	if(p == NULL)
-		return;
+    if(p == NULL)
+        return;
    
    if(size <= minSizeIndex[0])
        size = minSizeIndex[0];
@@ -1092,7 +1092,7 @@ void XMem::RebuildIndex()
            block->mem = NULL;
        }
        else{
-        	/* if the block is in use, we build the index */
+            /* if the block is in use, we build the index */
            int pieceCount = 0;
            MTYPE size = 0;
            MHeader * newLast = NULL;

--- a/source/tensor/XQueue-李垠桥的MacBook Pro.cpp
+++ b/source/tensor/XQueue-李垠桥的MacBook Pro.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * 
- * This is an implementation of queue. Actually we intend to use it to maintain
- * a priority job list
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-04-05
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "XQueue.h"
-#include "XDevice.h"
-#include "XList.h"
-#include "XUtility.h"
-
-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
-
-/**************************************
-job item used in queues
-*/
-
-/* constructor */
-JobQueueNode::JobQueueNode()
-{
-    job  = NULL;
-    args = new TensorList(1);
-}
-
-/* de-constructor */
-JobQueueNode::~JobQueueNode()
-{
-    delete args;
-}
-
-/**************************************
-This class provides standard utilities of Queue.
-*/
-
-/* constuctor */
-XQueue::XQueue(int mySize)
-{
-    queue = new void*[mySize];
-
-    memset(queue, 0, sizeof(void*) * mySize);
-
-    size = mySize;
-    itemCount = 0;
-    head = 0;
-    tail = 0;
-    isJobQueue = false;
-    jobDequeuerArgs = new TensorList(1);
-    jobDequeuerBreak = false;
-    runningJobCount = 0;
-    jobStream = NULL;
-    jobStream1 = NULL;
-    jobStream2 = NULL;
-    
-    MUTEX_INIT(enqueueMutex);
-    MUTEX_INIT(dequeueMutex);
-    COND_INIT(queueCond);
-    MUTEX_INIT(jobQueueMutex);
-}
-
-/* deconstructor */
-XQueue::~XQueue()
-{
-    delete[] queue;
-    delete jobDequeuerArgs;
-    delete jobStream;
-    delete jobStream1;
-    delete jobStream2;
-
-    //if(isJobQueue)
-    //    StopJobConsumer();
-
-    MUTEX_DELE(enqueueMutex);
-    MUTEX_DELE(dequeueMutex);
-    COND_DELE(queueCond);
-    MUTEX_DELE(jobQueueMutex);
-}
-
-/* 
-put an item in the tail of the queue 
->> item - the item we intend to add into the queue
-*/
-void XQueue::Enqueue(void * item)
-{
-
-    MUTEX_LOCK(enqueueMutex);
-    MUTEX_LOCK(dequeueMutex);
-
-    CheckNTErrors((itemCount < size), "Put too many items into the queue!");
-
-    queue[tail] = item;
-    tail = (tail + 1) % size;
-    itemCount++;
-    
-    COND_SIGNAL(queueCond);
-
-    MUTEX_UNLOCK(dequeueMutex);
-    MUTEX_UNLOCK(enqueueMutex);
-}
-
-/* 
-fetch an item from head of the queue 
-<< return - the head item of the queue
-*/
-void * XQueue::Dequeue()
-{
-    MUTEX_LOCK(dequeueMutex);
-
-    while(itemCount == 0)
-    {
-#ifdef  WIN32
-        MUTEX_UNLOCK(dequeueMutex);
-#endif
-        COND_WAIT(queueCond, dequeueMutex);
-#ifdef  WIN32
-        MUTEX_LOCK(dequeueMutex);
-#endif
-    }
-
-    void * r = queue[head];
-    head = (head + 1) % size;
-    itemCount--;
-
-    MUTEX_UNLOCK(dequeueMutex);
-
-    return r;
-}
-
-/* return if the queue is empty */
-bool XQueue::IsEmpty()
-{
-    return itemCount == 0;
-}
-
-/* wait until the queue is empty */
-void XQueue::WaitForEmptyJobQueue()
-{
-    while(runningJobCount > 0){
-        XSleep(10);
-    }
-
-    if(jobStream != NULL){
-        CheckNTErrors((jobStream->IsFinished()), "None fineished jobs remain");
-        jobStream->Clear();
-    }
-    if(jobStream1 != NULL){
-        CheckNTErrors((jobStream1->IsFinished()), "None fineished jobs remain");
-        jobStream1->Clear();
-    }
-    if(jobStream2 != NULL){
-        CheckNTErrors((jobStream2->IsFinished()), "None fineished jobs remain");
-        jobStream2->Clear();
-    }
-}
-
-int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-int cpuid = -1;
-
-/* 
-run job consumer (in another thread) 
->> jobDevID - id of the device for running the jobs
-*/
-void XQueue::RunJobConsumer(int jobDevID)
-{
-    CheckNTErrors((jobDevID < 16), "device id is out of scope!");
-
-    isJobQueue = true;
-    jobDequeuerArgs->Clear();
-    jobDequeuerArgs->Add(this);
-    jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
-
-    jobDequeuer.function = (TFunction)DequeueJobs;
-    jobDequeuer.argv = jobDequeuerArgs;
-
-    jobDequeuer.Start();
-    jobDequeuer.LetItGo();
-}
-
-/* stop the job consumer */
-void XQueue::StopJobConsumer()
-{
-    jobDequeuerBreak = true;
-    XSleep(10);
-
-    EnqueueJob(NULL, NULL);
-
-    jobDequeuer.End();
-    isJobQueue = false;
-}
-
-/* add a job item to process */
-void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
-{
-    MUTEX_LOCK(jobQueueMutex);
-    runningJobCount++;
-    MUTEX_UNLOCK(jobQueueMutex);
-
-    JobQueueNode * node = new JobQueueNode();
-    node->job = job;
-    if(jobArgs != NULL)
-        node->args->AddList(jobArgs);
-    Enqueue(node);
-}
-
-/* job item consumer */
-void XQueue::DequeueJobs(TensorList * args)
-{
-    CheckNTErrors((args->count == 2), "Illegal arguments!");
-
-    XQueue * q = (XQueue*)args->GetItem(0);
-    int devID = *(int*)args->GetItem(1);
-
-    int devIDBackup = XDevice::GetGPUDevice();
-
-    if(devID >= 0)
-        XDevice::SetGPUDevice(devID);
-
-    while(1){
-        JobQueueNode * node = (JobQueueNode*)q->Dequeue();
-
-        if(q->GetJobBreak())
-            break;
-
-        CheckNTErrors((node != NULL), "Illegal job!");
-
-        /* process a job */
-        ((TFunction)node->job)(node->args);
-
-        delete node;
-
-        MUTEX_LOCK(q->jobQueueMutex);
-        q->runningJobCount--;
-        MUTEX_UNLOCK(q->jobQueueMutex);
-
-    }
-
-    if(devID >= 0)
-        XDevice::SetGPUDevice(devIDBackup);
-}
-
-/* get the break flag */
-bool XQueue::GetJobBreak()
-{
-    return jobDequeuerBreak;
-}
-
-/* get job stream */
-XStream * XQueue::GetJobStream(int n)
-{
-    if(n == 0)
-        return jobStream;
-    else if(n == 1)
-        return jobStream1;
-    else if(n == 2)
-        return jobStream2;
-    else{
-        ShowNTErrors("invalid stream id!");
-    }
-
-    return NULL;
-}
-
-/* make job streams */
-void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
-{
-    if(devID != INVALID_DEVICE_ID)
-        jobStream = new XStream(0, devID);
-    if(devID1 != INVALID_DEVICE_ID)
-        jobStream1 = new XStream(0, devID1);
-    if(devID2 != INVALID_DEVICE_ID)
-        jobStream2 = new XStream(0, devID2);
-}
-
-} /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/tensor/XQueue.cpp
+++ b/source/tensor/XQueue.cpp
@@ -189,7 +189,7 @@ void XQueue::RunJobConsumer(int jobDevID)
    isJobQueue = true;
    jobDequeuerArgs->Clear();

-	// warning: this may cause unknown error
+    // warning: this may cause unknown error
    jobDequeuerArgs->Add((XTensor*)this);
    jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);


--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -190,7 +190,6 @@ XTensor::XTensor(const XTensor &reference)

    isInit = true;
    isTmp  = reference.isTmp;
-	enableGrad = reference.enableGrad;
 }

 /* copy constructor (with right value reference) */
@@ -219,7 +218,6 @@ XTensor::XTensor(const XTensor &&reference)

    isInit = true;
    isTmp  = reference.isTmp;
-	enableGrad = reference.enableGrad;
 }

 /* de-constructor */
@@ -285,7 +283,7 @@ void XTensor::Init()
    isTmp =  false;
    isGrad = false;
    isVar  = false;
-	enableGrad = false;
+    enableGrad = false;
    visitMark = 0;
    grad = NULL;
 }
@@ -316,6 +314,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 {
    strcpy(name, tensor.name);
    order = tensor.order;
+    enableGrad = tensor.enableGrad;
    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
    dataType = tensor.dataType;
@@ -403,7 +402,6 @@ XTensor& XTensor::operator= (const XTensor& tensor)
        /* create tensor links for the new tensor */
        XLink::Replace(&tensor, this);
    }
-	enableGrad = tensor.enableGrad;
    return *this;
 }

@@ -450,7 +448,6 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
    *tensor.dataP = NULL;

    XLink::Replace(&tensor, this);
-	enableGrad = tensor.enableGrad;
    return *this;
 }

@@ -1322,7 +1319,7 @@ set the value of a cell
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
-	CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
+    CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");

    return SetToDevice(devID, GetCell(index, size), value);
 }
@@ -2447,7 +2444,7 @@ void InitTensor(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

-	tensor->enableGrad = reference->enableGrad;
+    tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize, 
               reference->dataType, reference->denseRatio, 
               reference->devID, reference->mem);
@@ -2463,7 +2460,7 @@ void InitTensorV2(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

-	tensor->enableGrad = reference->enableGrad;
+    tensor->enableGrad = reference->enableGrad;
    InitTensorV2(tensor, reference->order, reference->dimSize, 
               reference->dataType, reference->devID);
 }
@@ -2478,7 +2475,7 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;
    
-	tensor->enableGrad = reference->enableGrad;
+    tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize,
               reference->dataType, reference->denseRatio,
               -1);

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -151,8 +151,8 @@ public:
    /* indicates whether the tensor keeps the gradient when used as model parameters */
    bool isGrad;

-	/* indicates whether the gradient of the tensor should be computed */
-	bool enableGrad;
+    /* indicates whether the gradient of the tensor should be computed */
+    bool enableGrad;

    /* indicates whether the tensor is used as paramters (or variables) */
    bool isVar;
@@ -453,7 +453,7 @@ extern int MakeTensorID();
 void InitTensor(XTensor * tensor,
                const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
                const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
-				
+                
 /* initialize a dense XTensor V2 */
 void InitTensorV2(XTensor * tensor,
                const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,

--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
@@ -142,6 +142,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
    _Div(a, b, a, alpha, leadingDim);
 }

+/*
+element-wise division of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+
+a(i) = a(i)/b(i) + \alpha * a(i)
+where i is the index of the item
+
+>> a - tensor a (where the result is kept)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
+{
+    _Div(&a, &b, &a, alpha, leadingDim);
+}
+
 /* 
 return a dimension if the division is performed as DivDim (in more details in DivDim.h)
 >> a - a tensor

--- a/source/tensor/core/arithmetic/Div.cu
+++ b/source/tensor/core/arithmetic/Div.cu
@@ -122,7 +122,7 @@ where i is the item index
 */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-	int leadingDimRDI = a->order - leadingDim - 1;
+    int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");

--- a/source/tensor/core/arithmetic/Div.h
+++ b/source/tensor/core/arithmetic/Div.h
@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i)
 where i is the index of the element 
 */
 void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
+void DivMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);

 /* 
 element-wise division of two tensors (return an XTensor structure)

--- a/source/tensor/core/arithmetic/Mask.cpp
+++ b/source/tensor/core/arithmetic/Mask.cpp
@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
 }

 /*
+mask entries of a given tensor (on site):
+a(i) = a(i) if mask(i) is non-zero
+a(i) = alpha if mask(i) = 0
+where i is the index of the element
+*/
+void MaskMe(XTensor& a, const XTensor& mask, DTYPE alpha)
+{
+    _Mask(&a, &mask, &a, alpha);
+}
+
+/*
 mask entries of a given tensor (return an XTensor structure):
 a(i) = a(i) if mask(i) is non-zero
 a(i) = alpha if mask(i) = 0

--- a/source/tensor/core/arithmetic/Mask.h
+++ b/source/tensor/core/arithmetic/Mask.h
@@ -43,6 +43,7 @@ a(i) = alpha if mask(i) = 0
 where i is the index of the element
 */
 void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
+void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha);

 /* 
 mask entries of a given tensor (return an XTensor structure):

--- a/source/tensor/core/arithmetic/MatrixMul2D.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2D.cpp
@@ -54,15 +54,15 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
                  "Input tensors must have a order = 2!");

-	int an = a->dimSize[0], am = a->dimSize[1];
-	int bn = b->dimSize[0], bm = b->dimSize[1];
-	int cn = c->dimSize[0], cm = c->dimSize[1];
-	int am2 = transposedA == X_TRANS ? an : am;
-	int an2 = transposedA == X_TRANS ? am : an;
-	int bm2 = transposedB == X_TRANS ? bn : bm;
-	int bn2 = transposedB == X_TRANS ? bm : bn;
-	int cm2 = cm;
-	int cn2 = cn;
+    int an = a->dimSize[0], am = a->dimSize[1];
+    int bn = b->dimSize[0], bm = b->dimSize[1];
+    int cn = c->dimSize[0], cm = c->dimSize[1];
+    int am2 = transposedA == X_TRANS ? an : am;
+    int an2 = transposedA == X_TRANS ? am : an;
+    int bm2 = transposedB == X_TRANS ? bn : bm;
+    int bn2 = transposedB == X_TRANS ? bm : bn;
+    int cm2 = cm;
+    int cn2 = cn;

    CheckNTErrors((am2 == bn2 && an2 == cn2 && bm2 == cm2),
                  "Unmatched tensors in multiplication!");

--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
@@ -40,21 +40,21 @@ argument7: matrix c (c=a*b*\alpha + c*beta)
 */
 void _MatrixMul2DMultiTheading(TensorList * args)
 {
-	CheckNTErrors(args->count == 2, "invalid argument number!");
-	IntList * indexArgs = (IntList*)args->GetItem(0);
-	TensorList * matrixArgs = (TensorList*)args->GetItem(1);
-	CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
-	CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
+    CheckNTErrors(args->count == 2, "invalid argument number!");
+    IntList * indexArgs = (IntList*)args->GetItem(0);
+    TensorList * matrixArgs = (TensorList*)args->GetItem(1);
+    CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
+    CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");

    XTensor * a = matrixArgs->GetItem(0);
    XTensor * b = matrixArgs->GetItem(1);
    XTensor * c = matrixArgs->GetItem(2);
    DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
    DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
-	int x1 = indexArgs->GetItem(0);
-	int y1 = indexArgs->GetItem(1);
-	int x2 = indexArgs->GetItem(2);
-	int y2 = indexArgs->GetItem(3);
+    int x1 = indexArgs->GetItem(0);
+    int y1 = indexArgs->GetItem(1);
+    int x2 = indexArgs->GetItem(2);
+    int y2 = indexArgs->GetItem(3);

 #ifdef FAST_MATRIX
    int am = a->dimSize[1];

--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
@@ -143,6 +143,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
    _Multiply(a, b, a, alpha, leadingDim);
 }

+/*
+element-wise product of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+
+a(i) = a(i)*b(i) + \alpha * a(i)
+where i is the index of the item
+
+>> a - tensor a (where the result is kept)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
+{
+    _Multiply(&a, &b, &a, alpha, leadingDim);
+}
+
 /* 
 return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
 >> a - a tensor

--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
@@ -122,7 +122,7 @@ where i is the item index
 */
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-	int leadingDimRDI = a->order - leadingDim - 1;
+    int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                  "Unmatched tensors in multiplication!");
    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");

--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i)
 where i is the index of the element 
 */
 void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
+void MultiplyMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);

 /* 
 element-wise product of two tensors (return an XTensor structure)

--- a/source/tensor/core/arithmetic/MultiplyDim.cpp
+++ b/source/tensor/core/arithmetic/MultiplyDim.cpp
@@ -139,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
 }

 /*
+tensor multiplication (do it on site)
+keep the result in the input tensor a and return nothing
+
+a = a * b + \alpha * a
+where the size of b is equal to the n-th dimension of a,
+i.e., a is multiplied with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> n - the dimension index
+>> alpha - the scaling factor
+*/
+void MultiplyDimMe(XTensor& a, const XTensor& b, int n, DTYPE alpha)
+{
+    _MultiplyDim(&a, &b, &a, n, alpha);
+}
+
+/*
 tensor multiplication (return an XTensor structure and make tensor connections)
 make a new tensor to keep the result and return it


--- a/source/tensor/core/arithmetic/MultiplyDim.h
+++ b/source/tensor/core/arithmetic/MultiplyDim.h
@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
 /* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
 void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
+void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);

 /* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */

--- a/source/tensor/core/arithmetic/Negate.cpp
+++ b/source/tensor/core/arithmetic/Negate.cpp
@@ -60,6 +60,16 @@ void _NegateMe(XTensor * a)
 }

 /*
+set every entry to its minus value (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor we are processing
+*/
+void NegateMe(XTensor& a)
+{
+    _Negate(&a, &a);
+}
+
+/*
 set every entry to its minus value (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor we are processing

--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
@@ -34,6 +34,7 @@ set every entry to its minus value (do it on site)
 keep the result in the input tensor a and return nothing
 */
 void _NegateMe(XTensor * a);
+void NegateMe(XTensor & a);

 /* 
 set every entry to its minus value (return an XTensor structure)

--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
@@ -66,6 +66,16 @@ void _SignMe(XTensor * a)
 }

 /*
+set every entry to its sign value (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor we are processing
+*/
+void SignMe(XTensor& a)
+{
+    _Sign(&a, &a);
+}
+
+/*
 set every entry to its sign value (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor we are processing

--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
 void _SignMe(XTensor * a);

 /* 
+set every entry to its sign value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void SignMe(XTensor & a);
+
+/* 
 set every entry to its sign value  (return an XTensor structure)
 make a new tensor to keep the result and return it
 */

--- a/source/tensor/core/arithmetic/Sub.cpp
+++ b/source/tensor/core/arithmetic/Sub.cpp
@@ -126,6 +126,19 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
 {
    _Sub(a, b, a, beta);
 }
+
+/*
+tensor subtraction a = a - b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+*/
+void SubMe(XTensor& a, const XTensor& b, DTYPE beta)
+{
+    _Sub(&a, &b, &a, beta);
+}
  
 /* 
 return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)

--- a/source/tensor/core/arithmetic/Sub.h
+++ b/source/tensor/core/arithmetic/Sub.h
@@ -35,6 +35,7 @@ tensor subtraction a = a - b * \beta
 keep the result in the input tensor a and return nothing
 */
 void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+void SubMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
    
 /*
 tensor subtraction c = a - b * \beta

--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
@@ -46,79 +46,79 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
 {
    n = MODX(n, a->order);

-	CheckNTErrors(a && b && c, "Empty tensor input!");
-	CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
-	CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-		          "Unmatched data types in subtraction!");
-	CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
-	CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
-	CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched data types in subtraction!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");

    CheckDev(a->devID, b->devID);

-	if (beta == 0) {
-		_CopyValues(a, c);
-		return;
-	}
+    if (beta == 0) {
+        _CopyValues(a, c);
+        return;
+    }

-	if (XTensor::IsSameShaped(a, b)) {
-		_Sub(a, b, c, beta);
-		return;
-	}
+    if (XTensor::IsSameShaped(a, b)) {
+        _Sub(a, b, c, beta);
+        return;
+    }

-	if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
 #ifdef USE_CUDA
-		_CudaSubDim(a, b, c, n, beta);
+        _CudaSubDim(a, b, c, n, beta);
 #else
-		ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif
-	}
-	else {
-		int stride = 1;
-		int blockSize = a->dimSize[n];
-		int blockNum = 1;
-
-		for (int i = a->order - 1; i >= 0; i--) {
-			if (i > n)
-				stride *= a->dimSize[i];
-			else if (i < n)
-				blockNum *= a->dimSize[i];
-		}
-
-		if (a->dataType == DEFAULT_DTYPE) {
-			int num = a->unitNum;
-			if (stride > 1) {
-				for (int i = 0, j = 0; i < num; i += stride, j++) {
-					DTYPE * ap = (DTYPE*)a->data + i;
-					DTYPE   bv = *((DTYPE*)b->data + j % blockSize) * beta;
-					DTYPE * cp = (DTYPE*)c->data + i;
-					for (int k = 0; k < stride; k++)
-						cp[k] = ap[k] - bv;
-				}
-			}
-			else if (stride == 1) {
-				DTYPE * bp = (DTYPE*)b->data;
-				for (int i = 0; i < num; i += blockSize) {
-					DTYPE * ap = (DTYPE*)a->data + i;
-					DTYPE * cp = (DTYPE*)c->data + i;
-					if (beta == 1.0F) {
-						for (int j = 0; j < blockSize; j++)
-							cp[j] = ap[j] - bp[j];
-					}
-					else {
-						for (int j = 0; j < blockSize; j++)
-							cp[j] = ap[j] - bp[j] * beta;
-					}
-				}
-			}
-			else {
-				ShowNTErrors("Something is wrong!");
-			}
-		}
-		else {
-			ShowNTErrors("TODO!");
-		}
-	}
+    }
+    else {
+        int stride = 1;
+        int blockSize = a->dimSize[n];
+        int blockNum = 1;
+
+        for (int i = a->order - 1; i >= 0; i--) {
+            if (i > n)
+                stride *= a->dimSize[i];
+            else if (i < n)
+                blockNum *= a->dimSize[i];
+        }
+
+        if (a->dataType == DEFAULT_DTYPE) {
+            int num = a->unitNum;
+            if (stride > 1) {
+                for (int i = 0, j = 0; i < num; i += stride, j++) {
+                    DTYPE * ap = (DTYPE*)a->data + i;
+                    DTYPE   bv = *((DTYPE*)b->data + j % blockSize) * beta;
+                    DTYPE * cp = (DTYPE*)c->data + i;
+                    for (int k = 0; k < stride; k++)
+                        cp[k] = ap[k] - bv;
+                }
+            }
+            else if (stride == 1) {
+                DTYPE * bp = (DTYPE*)b->data;
+                for (int i = 0; i < num; i += blockSize) {
+                    DTYPE * ap = (DTYPE*)a->data + i;
+                    DTYPE * cp = (DTYPE*)c->data + i;
+                    if (beta == 1.0F) {
+                        for (int j = 0; j < blockSize; j++)
+                            cp[j] = ap[j] - bp[j];
+                    }
+                    else {
+                        for (int j = 0; j < blockSize; j++)
+                            cp[j] = ap[j] - bp[j] * beta;
+                    }
+                }
+            }
+            else {
+                ShowNTErrors("Something is wrong!");
+            }
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
+    }
 }

 /*
@@ -136,7 +136,7 @@ i.e., a is subtracted with b by broadcasting
 */
 void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
 {
-	_SubDim(a, b, a, n, beta);
+    _SubDim(a, b, a, n, beta);
 }

 /*
@@ -155,20 +155,20 @@ i.e., a is subtracted with b by broadcasting
 */
 XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
 {
-	XTensor c(&a);
-	c.SetTMPFlag();
+    XTensor c(&a);
+    c.SetTMPFlag();

    n = MODX(n, a.order);

-	/* call _Sub function */
-	_SubDim(&a, &b, &c, n, beta);
+    /* call _SubDim function */
+    _SubDim(&a, &b, &c, n, beta);

-	/* tensor connections */
-	XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
-	XLink::AddParamToHeadInt(&c, n);
-	XLink::AddParamToHead(&c, beta);
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
+    XLink::AddParamToHeadInt(&c, n);
+    XLink::AddParamToHead(&c, beta);

-	return c;
+    return c;
 }

 /*

--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
@@ -39,25 +39,25 @@ where a is a tensor and b is a row vector
 */
 template <class T, bool betaFired>
 __global__
-	void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
+    void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
 {
-	__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-	int col = blockDim.x * blockIdx.x + threadIdx.x;
-	int row = blockDim.y * blockIdx.y + threadIdx.y;
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    int col = blockDim.x * blockIdx.x + threadIdx.x;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;

-	if (col >= colNum || row >= rowNum)
-		return;
+    if (col >= colNum || row >= rowNum)
+        return;

-	if (threadIdx.y == 0)
-		bv[threadIdx.x] = b[col];
+    if (threadIdx.y == 0)
+        bv[threadIdx.x] = b[col];

-	__syncthreads();
+    __syncthreads();

-	int offset = colNum * row + col;
-	if (betaFired)
-		c[offset] = a[offset] - bv[threadIdx.x] * beta;
-	else
-		c[offset] = a[offset] - bv[threadIdx.x];
+    int offset = colNum * row + col;
+    if (betaFired)
+        c[offset] = a[offset] - bv[threadIdx.x] * beta;
+    else
+        c[offset] = a[offset] - bv[threadIdx.x];
 }

 /*
@@ -75,30 +75,30 @@ where a is a tensor and b is a colum vector
 */
 template <class T, bool betaFired>
 __global__
-	void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
+    void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
 {
-	__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

-	int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
-	int row = blockDim.y * blockIdx.y + threadIdx.y;
+    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;

-	int col = colIndex % colNum;
-	int block = colIndex / colNum;
+    int col = colIndex % colNum;
+    int block = colIndex / colNum;

-	if (row >= rowNum || block >= blockNum)
-		return;
+    if (row >= rowNum || block >= blockNum)
+        return;

-	if (threadIdx.x == 0)
-		bv[threadIdx.y] = b[row];
+    if (threadIdx.x == 0)
+        bv[threadIdx.y] = b[row];

-	__syncthreads();
+    __syncthreads();

-	int offset = block * blockSize + row * colNum + col;
+    int offset = block * blockSize + row * colNum + col;

-	if (betaFired)
-		c[offset] = a[offset] - bv[threadIdx.y] * beta;
-	else
-		c[offset] = a[offset] - bv[threadIdx.y];
+    if (betaFired)
+        c[offset] = a[offset] - bv[threadIdx.y] * beta;
+    else
+        c[offset] = a[offset] - bv[threadIdx.y];
 }

 /*
@@ -116,63 +116,63 @@ i.e., a is subtracted with b by broadcasting
 */
 void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
 {
-	CheckNTErrors(a && b && c, "Empty tensor input!");
-	CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
-	CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-		          "Unmatched data types in subtraction!");
-	CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
-	CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
-	CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
-
-	int stride = 1;
-	int blockSize = a->dimSize[n];
-	int blockNum = 1;
-
-	for (int i = a->order - 1; i >= 0; i--) {
-		if (i > n)
-			stride *= a->dimSize[i];
-		else if (i < n)
-			blockNum *= a->dimSize[i];
-	}
-
-	int cudaGrids[3];
-	int cudaBlocks[3];
-
-	int devIDBackup = 0;
-	ProtectCudaDev(a->devID, devIDBackup);
-
-	if (a->dataType == DEFAULT_DTYPE) {
-		if (stride > 1) {
-			GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
-			if (beta == (DTYPE)1.0F)
-				KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockSize, stride, blockSize * stride, blockNum, beta);
-			else
-				KernelSubWithCol<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockSize, stride, blockSize * stride, blockNum, beta);
-		}
-		else if (stride == 1) {
-			GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-			if (beta == (DTYPE)1.0F)
-				KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockNum, blockSize, beta);
-			else
-				KernelSubWithRow<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockNum, blockSize, beta);
-		}
-		else {
-			ShowNTErrors("Something is wrong!");
-		}
-	}
-	else {
-		ShowNTErrors("TODO!");
-	}
-
-	BacktoCudaDev(a->devID, devIDBackup);
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched data types in subtraction!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+
+    int stride = 1;
+    int blockSize = a->dimSize[n];
+    int blockNum = 1;
+
+    for (int i = a->order - 1; i >= 0; i--) {
+        if (i > n)
+            stride *= a->dimSize[i];
+        else if (i < n)
+            blockNum *= a->dimSize[i];
+    }
+
+    int cudaGrids[3];
+    int cudaBlocks[3];
+
+    int devIDBackup = 0;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    if (a->dataType == DEFAULT_DTYPE) {
+        if (stride > 1) {
+            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
+            if (beta == (DTYPE)1.0F)
+                KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
+            else
+                KernelSubWithCol<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
+        }
+        else if (stride == 1) {
+            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+            if (beta == (DTYPE)1.0F)
+                KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockNum, blockSize, beta);
+            else
+                KernelSubWithRow<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                  blockNum, blockSize, beta);
+        }
+        else {
+            ShowNTErrors("Something is wrong!");
+        }
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
 }

 #endif

--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
@@ -132,6 +132,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
    _Sum(a, b, a, beta);
 }

+/*
+tensor summation a = a + b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+*/
+void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
+{
+    _Sum(&a, &b, &a, beta);
+}
+
 /* 
 return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
 >> a - a tensor

--- a/source/tensor/core/arithmetic/Sum.h
+++ b/source/tensor/core/arithmetic/Sum.h
@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta
 keep the result in the input tensor a and return nothing
 */
 void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
    
 /*
 tensor summation c = a + b * \beta

--- a/source/tensor/core/arithmetic/XTensorBLAS.cpp
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cpp
@@ -48,12 +48,12 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((c->dataType == DEFAULT_DTYPE), "TODO!");

 #if defined(USE_BLAS)
-	int an = a->dimSize[0];
+    int an = a->dimSize[0];
    int am = a->dimSize[1];
-	int bn = b->dimSize[0];
-	int bm = b->dimSize[1];
-	int cn = c->dimSize[0];
-	int cm = c->dimSize[1];
+    int bn = b->dimSize[0];
+    int bm = b->dimSize[1];
+    int cn = c->dimSize[0];
+    int cm = c->dimSize[1];

    if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
        GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, cn, cm, am, alpha, (DTYPE*)a->data, am, (DTYPE*)b->data, bm, beta, (DTYPE*)c->data, cm);

--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
@@ -165,7 +165,7 @@ SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
 SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)

 _SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
+SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
 SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)

 #else

--- a/source/tensor/core/math/Binary.h
+++ b/source/tensor/core/math/Binary.h
@@ -37,9 +37,16 @@ void _Scale(const XTensor * a, XTensor * b, float scale);
 scale up tensor entires (on site)
 b = a * scale
 */
-void _ScaleMe(XTensor & a, int scale);
-void _ScaleMe(XTensor & a, float scale);
-    
+void _ScaleMe(XTensor * a, int scale);
+void _ScaleMe(XTensor * a, float scale);
+
+/*
+scale up tensor entries (on site)
+a = a * scale
+*/
+void ScaleMe(XTensor & a, int scale);
+void ScaleMe(XTensor & a, float scale);
+   
 /*
 scale up tensor entires
 b = a * scale
@@ -64,8 +71,15 @@ void _Descale(const XTensor * a, XTensor * b, float scale);
 descale tensor entires (on site)
 b = a / scale
 */
-void _DescaleMe(XTensor & a, int scale);
-void _DescaleMe(XTensor & a, float scale);
+void _DescaleMe(XTensor * a, int scale);
+void _DescaleMe(XTensor * a, float scale);
+
+/*
+descale tensor entries (on site)
+a = a / scale
+*/
+void DescaleMe(XTensor & a, int scale);
+void DescaleMe(XTensor & a, float scale);
    
 /*
 descale tensor entires
@@ -91,8 +105,15 @@ void _Shift(const XTensor * a, XTensor * b, float shift);
 shift tensor entires (on site)
 b = a + shift
 */
-void _ShiftMe(XTensor & a, int shift);
-void _ShiftMe(XTensor & a, float shift);
+void _ShiftMe(XTensor * a, int shift);
+void _ShiftMe(XTensor * a, float shift);
+
+/*
+shift tensor entries (on site)
+a = a + shift
+*/
+void ShiftMe(XTensor & a, int shift);
+void ShiftMe(XTensor & a, float shift);
    
 /*
 shift tensor entires
@@ -118,7 +139,13 @@ void _Mod(const XTensor * a, XTensor * b, int base);
 mod tensor entires (on site)
 b = a % mod
 */
-void _ModMe(XTensor & a, int base);
+void _ModMe(XTensor * a, int base);
+
+/*
+mod tensor entries (on site)
+a = a % base
+*/
+void ModMe(XTensor & a, int base);
    
 /*
 mod tensor entires

--- a/source/tensor/core/math/Clip.cpp
+++ b/source/tensor/core/math/Clip.cpp
@@ -36,26 +36,26 @@ set every entry to its clip value
 void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
 {
 #ifdef USE_CUDA
-	/* run it on GPUs */
-	if (a->devID >= 0) {
-		_CudaClip(a, b, lower, upper);
-		return;
-	}
+    /* run it on GPUs */
+    if (a->devID >= 0) {
+        _CudaClip(a, b, lower, upper);
+        return;
+    }
 #endif

-	CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-	CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-
-	DTYPE * d = (DTYPE*)a->data;
-	DTYPE * db = (DTYPE*)b->data;
-	for (int i = 0; i < a->unitNum; i++) {
-		if (d[i] > upper)
-			db[i] = upper;
-		else if (d[i] < lower)
-			db[i] = lower;
-		else
-			db[i] = d[i];
-	}
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+
+    DTYPE * d = (DTYPE*)a->data;
+    DTYPE * db = (DTYPE*)b->data;
+    for (int i = 0; i < a->unitNum; i++) {
+        if (d[i] > upper)
+            db[i] = upper;
+        else if (d[i] < lower)
+            db[i] = lower;
+        else
+            db[i] = d[i];
+    }
 }

 /*
@@ -67,7 +67,19 @@ keep the result in the input tensor a and return nothing
 */
 void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
 {
-	_Clip(a, a, lower, upper);
+    _Clip(a, a, lower, upper);
+}
+
+/*
+set every entry to its clip value (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor we are processing
+>> lower - the lower border
+>> upper - the upper border
+*/
+void ClipMe(XTensor& a, DTYPE lower, DTYPE upper)
+{
+    _Clip(&a, &a, lower, upper);
 }

 /*
@@ -80,18 +92,18 @@ make a new tensor to keep the result and return it
 */
 XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
 {
-	XTensor b(&a);
-	b.SetTMPFlag();
+    XTensor b(&a);
+    b.SetTMPFlag();

-	/* call _Clip function */
-	_Clip(&a, &b, lower, upper);
+    /* call _Clip function */
+    _Clip(&a, &b, lower, upper);

-	/* tensor connections */
-	XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
-	XLink::AddParamToHead(&b, lower);
-	XLink::AddParamToHead(&b, upper);
+    /* tensor connections */
+    XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
+    XLink::AddParamToHead(&b, lower);
+    XLink::AddParamToHead(&b, upper);

-	return b;
+    return b;
 }

 void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper)

--- a/source/tensor/core/math/Clip.cu
+++ b/source/tensor/core/math/Clip.cu
@@ -36,18 +36,18 @@ set each entry to its clip value (CUDA Kernel)
 >> size - size of the data array
 */
 __global__
-	void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
+    void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
 {
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-	if (i < size) {
-		if (a[i] > upper)
-			b[i] = upper;
-		else if (a[i] < lower)
-			b[i] = lower;
-		else
-			b[i] = a[i];
-	}
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        if (a[i] > upper)
+            b[i] = upper;
+        else if (a[i] < lower)
+            b[i] = lower;
+        else
+            b[i] = a[i];
+    }
 }

 /*
@@ -62,7 +62,7 @@ This is for float16 computation
 __global__
 void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
 {
-	return;
+    return;
 }

 /*
@@ -74,31 +74,31 @@ set each entry to its clip value
 */
 void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
 {
-	CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-	CheckNTErrors((a->isSparse == false), "TODO!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((a->isSparse == false), "TODO!");

-	int gridSize[3];
-	int blockSize[3];
+    int gridSize[3];
+    int blockSize[3];

-	GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);

-	dim3 blocks(gridSize[0]);
-	dim3 threads(blockSize[0]);
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);

-	int devIDBackup;
-	ProtectCudaDev(a->devID, devIDBackup);
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);

-	if (a->dataType == DEFAULT_DTYPE) {
-		KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
-	}
-	else if (a->dataType == X_FLOAT16) {
-		KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
-	}
-	else {
-		ShowNTErrors("TODO!");
-	}
+    if (a->dataType == DEFAULT_DTYPE) {
+        KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
+    }
+    else if (a->dataType == X_FLOAT16) {
+        KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }

-	BacktoCudaDev(a->devID, devIDBackup);
+    BacktoCudaDev(a->devID, devIDBackup);
 }

 /*

--- a/source/tensor/core/math/Clip.h
+++ b/source/tensor/core/math/Clip.h
@@ -33,6 +33,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
   keep the result in the input tensor a and return nothing */
 void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);

+/* set every entry to its clip value (do it on site)
+keep the result in the input tensor a and return nothing */
+void ClipMe(XTensor & a, DTYPE lower, DTYPE upper);
+
 /* set every entry to its clip value  (return an XTensor structure)
   make a new tensor to keep the result and return it */
 XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);

--- a/source/tensor/core/math/Compare.h
+++ b/source/tensor/core/math/Compare.h
@@ -32,6 +32,9 @@ void _Equal(const XTensor * a, XTensor * b, DTYPE value);
 /* check whether every entry is equal to the given value (do it on site) */
 void _EqualMe(XTensor * a, DTYPE value);

+/* check whether every entry is equal to the given value (do it on site) */
+void EqualMe(XTensor & a, DTYPE value);
+
 /* check whether every entry is equal to the given value (return an XTensor structure) */
 XTensor Equal(const XTensor & a, DTYPE value);

@@ -41,6 +44,9 @@ void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
 /* check whether every entry is not equal to the given value (do it on site) */
 void _NotEqualMe(XTensor * a, DTYPE value);

+/* check whether every entry is not equal to the given value (do it on site) */
+void NotEqualMe(XTensor & a, DTYPE value);
+
 /* check whether every entry is not equal to the given value (return an XTensor structure) */
 XTensor NotEqual(const XTensor & a, DTYPE value);


--- a/source/tensor/core/math/Normalize.cpp
+++ b/source/tensor/core/math/Normalize.cpp
@@ -44,7 +44,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 */
 void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
-	int dimRDI = input->order - dim - 1;
+    int dimRDI = input->order - dim - 1;
    CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
    CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
@@ -113,6 +113,27 @@ void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor 
 {
    _Normalize(input, input, dim, mean, var, a, b, epsilon);
 }
+
+/*
+normalized the data with normal distribution (do it on site)
+keep the result in the input tensor and return nothing
+
+For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
+where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
+
+>> input - the input tensor
+>> dim - dimension along which we generate the mean and variance
+>> mean - the mean of the input
+>> var - the variance of the input
+>> a - the scalar
+>> b - the bias
+>> epsilon - a parameter
+*/
+void NormalizeMe(XTensor& input, int dim, const XTensor& mean, const XTensor& var, const XTensor& a, const XTensor& b, DTYPE epsilon)
+{
+    _Normalize(&input, &input, dim, &mean, &var, &a, &b, epsilon);
+}
+
 /*
 normalized the data with normal distribution (return an XTensor structure)
 make a new tensor to keep the result and return it 

--- a/source/tensor/core/math/Normalize.cu
+++ b/source/tensor/core/math/Normalize.cu
@@ -95,8 +95,8 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 {
    CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

-	int dimRDI = input->order - dim - 1;
-	int stride = 1;
+    int dimRDI = input->order - dim - 1;
+    int stride = 1;
    int strideNum = input->dimSizeRDI[dimRDI];
    int blockNum = 1;
    for (int i = 0; i < input->order; i++) {

--- a/source/tensor/core/math/Normalize.h
+++ b/source/tensor/core/math/Normalize.h
@@ -42,6 +42,14 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);

 /*
+normalized the data with normal distribution (do it on site)
+keep the result in the input tensor and return nothing
+For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
+where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
+*/
+void NormalizeMe(XTensor & input, int dim, const XTensor & mean, const XTensor & var, const XTensor & a, const XTensor & b, DTYPE epsilon);
+
+/*
 normalized the data with normal distribution (return an XTensor structure)
 make a new tensor to keep the result and return it 
 For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b

--- a/source/tensor/core/math/Power.cpp
+++ b/source/tensor/core/math/Power.cpp
@@ -81,6 +81,17 @@ void _PowerMe(XTensor * a, DTYPE p)
 }

 /*
+get the power(a, p) (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - the tensor
+>> p - parameter
+*/
+void PowerMe(XTensor& a, DTYPE p)
+{
+    _Power(&a, &a, p);
+}
+
+/*
 get the power(a, p) (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor

--- a/source/tensor/core/math/Power.h
+++ b/source/tensor/core/math/Power.h
@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
 void _PowerMe(XTensor * a, DTYPE p);

 /* 
+get the power(a, p) (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void PowerMe(XTensor & a, DTYPE p);
+
+/* 
 get the power(x, y) (return an XTensor structure)
 make a new tensor to keep the result and return it
 */

--- a/source/tensor/core/math/ScaleAndShift.cpp
+++ b/source/tensor/core/math/ScaleAndShift.cpp
@@ -92,6 +92,21 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
 }

 /* 
+scale and shift all tensor entries (do it on site)
+keep the result in the input tensor a and return nothing
+
+a = a * scale + shift
+
+>> a - the input/output tensor
+>> scale - the scale factor
+>> shift - the shift factor
+*/
+void ScaleAndShiftMe(XTensor& a, DTYPE scale, DTYPE shift)
+{
+    _ScaleAndShift(&a, &a, scale, shift);
+}
+
+/* 
 scale and shift all tensor entires (return an XTensor structure)
 make a new tensor to keep the result and return it


--- a/source/tensor/core/math/ScaleAndShift.h
+++ b/source/tensor/core/math/ScaleAndShift.h
@@ -45,6 +45,13 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);

 /*
 scale and shift all tensor entires
+keep the result in the input tensor a and return nothing
+a = a * scale + shift 
+*/
+void ScaleAndShiftMe(XTensor & a, DTYPE scale, DTYPE shift = 0);
+
+/*
+scale and shift all tensor entries
 make a new tensor to keep the result and return it
 b = a * scale + shift 
 */

--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
@@ -34,7 +34,7 @@ DTYPE square(DTYPE x)

 DTYPE round(DTYPE r)
 {
-	return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
+    return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
 }

 DTYPE isnonzero(DTYPE r)

--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
@@ -38,7 +38,7 @@ DTYPE cudasquare(DTYPE x)
 __device__
 DTYPE cudaround(DTYPE r)
 {
-	return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
+    return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
 }

 __device__

--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
@@ -31,6 +31,9 @@ void _Absolute(const XTensor * a, XTensor * b);
 /* set every entry to its absolute value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _AbsoluteMe(XTensor * a);
+/* set every entry to its absolute value (do it on site)
+keep the result in the input tensor a and return nothing */
+void AbsoluteMe(XTensor & a);
 /* set every entry to its absolute value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Absolute(const XTensor & a);
@@ -42,6 +45,9 @@ void _Ceil(const XTensor * a, XTensor * b);
 /* set every entry to its ceil value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _CeilMe(XTensor * a);
+/* set every entry to its ceil value (do it on site)
+keep the result in the input tensor a and return nothing */
+void CeilMe(XTensor & a);
 /* set every entry to its ceil value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Ceil(const XTensor & a);
@@ -53,6 +59,9 @@ void _Exp(const XTensor * a, XTensor * b);
 /* set every entry to its exponent value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _ExpMe(XTensor * a);
+/* set every entry to its exponent value (do it on site)
+keep the result in the input tensor a and return nothing */
+void ExpMe(XTensor & a);
 /* set every entry to its exponent value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Exp(const XTensor & a);
@@ -64,6 +73,9 @@ void _Floor(const XTensor * a, XTensor * b);
 /* set every entry to its floor value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _FloorMe(XTensor * a);
+/* set every entry to its floor value (do it on site)
+keep the result in the input tensor a and return nothing */
+void FloorMe(XTensor & a);
 /* set every entry to its floor value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Floor(const XTensor & a);
@@ -75,6 +87,9 @@ void _IsNonZero(const XTensor *a, XTensor *b);
 /* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
 keep the result in the input tensor a and return nothing */
 void _IsNonZeroMe(XTensor *a);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
+keep the result in the input tensor a and return nothing */
+void IsNonZeroMe(XTensor &a);
 /* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor IsNonZero(const XTensor &a);
@@ -86,6 +101,9 @@ void _IsZero(const XTensor *a, XTensor *b);
 /* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
 keep the result in the input tensor a and return nothing */
 void _IsZeroMe(XTensor *a);
+/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
+keep the result in the input tensor a and return nothing */
+void IsZeroMe(XTensor &a);
 /* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor IsZero(const XTensor &a);
@@ -97,6 +115,9 @@ void _Log(const XTensor * a, XTensor * b);
 /* set every entry to its logarithm value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _LogMe(XTensor * a);
+/* set every entry to its logarithm value (do it on site)
+keep the result in the input tensor a and return nothing */
+void LogMe(XTensor & a);
 /* set every entry to its logarithm value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Log(const XTensor & a);
@@ -108,6 +129,9 @@ void _Round(const XTensor * a, XTensor * b);
 /* set every entry to its round value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _RoundMe(XTensor * a);
+/* set every entry to its round value (do it on site)
+keep the result in the input tensor a and return nothing */
+void RoundMe(XTensor & a);
 /* set every entry to its round value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Round(const XTensor & a);
@@ -119,6 +143,9 @@ void _Sqrt(const XTensor * a, XTensor * b);
 /* set every entry to its sqrt value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _SqrtMe(XTensor * a);
+/* set every entry to its sqrt value (do it on site)
+keep the result in the input tensor a and return nothing */
+void SqrtMe(XTensor & a);
 /* set every entry to its sqrt value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Sqrt(const XTensor & a);
@@ -130,6 +157,9 @@ void _Square(const XTensor * a, XTensor * b);
 /* set every entry to its square value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _SquareMe(XTensor * a);
+/* set every entry to its square value (do it on site)
+keep the result in the input tensor a and return nothing */
+void SquareMe(XTensor & a);
 /* set every entry to its square value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Square(const XTensor & a);
@@ -142,6 +172,9 @@ void _Sin(const XTensor * a, XTensor * b);
 /* set every entry to its sine value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _SinMe(XTensor * a);
+/* set every entry to its sine value (do it on site)
+keep the result in the input tensor a and return nothing */
+void SinMe(XTensor & a);
 /* set every entry to its sine value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Sin(const XTensor & a);
@@ -153,6 +186,9 @@ void _Cos(const XTensor * a, XTensor * b);
 /* set every entry to its cosine value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _CosMe(XTensor * a);
+/* set every entry to its cosine value (do it on site)
+keep the result in the input tensor a and return nothing */
+void CosMe(XTensor & a);
 /* set every entry to its cosine value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Cos(const XTensor & a);
@@ -164,6 +200,9 @@ void _Tan(const XTensor * a, XTensor * b);
 /* set every entry to its tangent value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _TanMe(XTensor * a);
+/* set every entry to its tangent value (do it on site)
+keep the result in the input tensor a and return nothing */
+void TanMe(XTensor & a);
 /* set every entry to its tangent value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Tan(const XTensor & a);

--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
@@ -41,8 +41,8 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
    CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
-	
-	int dimRDI = input->order - dim - 1;
+    
+    int dimRDI = input->order - dim - 1;
    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");

    for(int i = 0; i < input->order; i++){
@@ -104,7 +104,7 @@ make a new tensor to keep the result and return it
 XTensor ReduceMax(const XTensor &input, int dim)
 {
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
+    
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){

--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -504,7 +504,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
    CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");

-	int dimRDI = input->order - dim - 1;
+    int dimRDI = input->order - dim - 1;
    for(int i = 0; i < input->order; i++){
        if(i < dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");

--- a/source/tensor/core/reduce/ReduceMean.cpp
+++ b/source/tensor/core/reduce/ReduceMean.cpp
@@ -39,7 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
 {
    CheckNTErrors((input->order > dim), "Illegal dimension specified!");

-	int dimRDI = input->order - dim - 1;
+    int dimRDI = input->order - dim - 1;
    int num = input->dimSizeRDI[dimRDI];

    _ReduceSum(input, output, dim);
@@ -59,7 +59,7 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
 XTensor ReduceMean(const XTensor &input, int dim)
 {
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
+    
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){

--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
@@ -50,7 +50,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor 
    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
    CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");

-	int dimRDI = input->order - dim - 1;
+    int dimRDI = input->order - dim - 1;
    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");

    for(int i = 0; i < input->order; i++){
@@ -215,7 +215,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
 XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
 {
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
+    
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){
@@ -294,7 +294,7 @@ sum = \sum_i exp((a_i)^power) if isExp == true
 XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
 {
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
+    
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -341,7 +341,7 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
        if (tid < blockDim.x / 32)
            value = data[tid];
        else
-	        value = 0;
+            value = 0;
        value = shflDownReduceSum(value);

        if (tid == 0 && blockIdx.x < reducedStrideNum) {
@@ -692,7 +692,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
    CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");

-	int dimRDI = input->order - dim - 1;
+    int dimRDI = input->order - dim - 1;
    for(int i = 0; i < input->order; i++){
        if(i < dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");

--- a/source/tensor/core/reduce/ReduceSumSquared.cpp
+++ b/source/tensor/core/reduce/ReduceSumSquared.cpp
@@ -55,7 +55,7 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
 XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
 {
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
+    
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){

--- a/source/tensor/core/reduce/ReduceVariance.cpp
+++ b/source/tensor/core/reduce/ReduceVariance.cpp
@@ -38,7 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
 */
 void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
 {
-	int dimRDI = input->order - dim - 1;
+    int dimRDI = input->order - dim - 1;
    int num = input->dimSizeRDI[dimRDI];
    _ReduceSum(input, output, dim, mean, 2.0F);
    _ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
@@ -58,7 +58,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
 XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
 {
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
+    
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){

--- a/source/tensor/core/shape/ConcatenateSolely.cpp
+++ b/source/tensor/core/shape/ConcatenateSolely.cpp
@@ -85,7 +85,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
        }
    }
    else {
-		StrList* sourceArrays = new StrList(smalls->count);
+        StrList* sourceArrays = new StrList(smalls->count);
        int * blockSizes = new int[smalls->count];
        for (int i = 0; i < smalls->count; i++) {
            XTensor * tensor = (XTensor*)smalls->GetItem(i);

--- a/source/tensor/core/shape/Permute.h
+++ b/source/tensor/core/shape/Permute.h
@@ -41,6 +41,13 @@ a = permuted(a)
 */
 void _PermuteMe(XTensor * a, int * dimPermute);

+/*
+permute the tensor dimensions (do it on site).
+keep the result in the input tensor and return nothing.
+a = permuted(a)
+*/
+void PermuteMe(XTensor  &a, int * dimPermute);
+
 /* 
 make a tensor with permuted dimensions (return an XTensor structure).
 make a new tensor to keep the result and return it.

--- a/source/tensor/core/shape/Reshape.cpp
+++ b/source/tensor/core/shape/Reshape.cpp
@@ -43,9 +43,9 @@ XTensor Reshape(XTensor &s, int order, int * dimSize)
    t.Reshape(order, dimSize);

    /* tensor connections */
-	XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
+    XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);

-	return t;
+    return t;
 }

 void Reshape(XTensor &s, XTensor &t, int order, int * dimSize)

--- a/source/tensor/core/shape/Squeeze.cpp
+++ b/source/tensor/core/shape/Squeeze.cpp
@@ -89,6 +89,20 @@ void _SqueezeMe(XTensor * source, int leadingDim)
 }

 /*
+squeeze the tensor along the specified dimension (do it on site)
+keep the result in the input tensor source and return nothing
+
+>> source - the input tensor
+>> leadingDim - the dimension that we would squeeze
+                if leadingDim = -1, squeeze all dimensions that are 1
+                else, squeeze the specified dimension
+*/
+void SqueezeMe(XTensor& source, int leadingDim)
+{
+    _Squeeze(&source, &source, leadingDim);
+}
+
+/*
 squeeze the tensor along the specified dimension (return an XTensor structure)
 make a new tensor to keep the result and return it


--- a/source/tensor/core/shape/Squeeze.h
+++ b/source/tensor/core/shape/Squeeze.h
@@ -33,6 +33,10 @@ void _Squeeze(XTensor * source, XTensor * target, int leadingDim = -1);
   keep the result in the input tensor a and return nothing */
 void _SqueezeMe(XTensor * source, int leadingDim = -1);

+/* squeeze the tensor along the specified dimension (do it on site)
+   keep the result in the input tensor a and return nothing */
+void SqueezeMe(XTensor & source, int leadingDim = -1);
+
 /* squeeze the tensor along the specified dimension  (return an XTensor structure)
   make a new tensor to keep the result and return it */
 XTensor Squeeze(XTensor & source, int leadingDim = -1);

--- a/source/tensor/core/sort/Sort.cpp
+++ b/source/tensor/core/sort/Sort.cpp
@@ -45,7 +45,7 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
    CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
    CheckNTErrors((index->dataType == X_INT), "Wrong data type!");

-	int dimRDI = a->order - dim - 1;
+    int dimRDI = a->order - dim - 1;
    /* make the index tensor */
    index->SetAscendingOrder(dim);

@@ -67,7 +67,7 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
            blockNum *= a->dimSizeRDI[i];
        int blockSize = stride * strideNum;

-		_CopyValues(a, b);
+        _CopyValues(a, b);
        for (int k = 0; k < blockNum; k++) {
        for (int i = 0; i < stride; i++) {
                void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
@@ -98,6 +98,21 @@ void _SortMe(XTensor * a, XTensor * index, int dim)
 }

 /*
+sort the tensor along a given dimension (do it on site)
+keep the result in the input tensor a and return nothing
+
+>> a - input tensor
+>> index - index of the items in the resulting tensor
+>> dim - the dimension along which the sorting is performed
+*/
+void SortMe(XTensor& a, XTensor& index, int dim)
+{
+    _Sort(&a, &a, &index, dim);
+}
+
+
+
+/*
 sort the tensor along a given dimension (return an XTensor structure)
 make a new tensor to keep the result and return it


--- a/source/tensor/core/sort/Sort.cu
+++ b/source/tensor/core/sort/Sort.cu
@@ -217,7 +217,7 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
    CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

-	int dimRDI = a->order - dim - 1;
+    int dimRDI = a->order - dim - 1;
    if (k < 0 || k > b->dimSizeRDI[dimRDI])
        k = b->dimSizeRDI[dimRDI];


--- a/source/tensor/core/sort/Sort.h
+++ b/source/tensor/core/sort/Sort.h
@@ -35,6 +35,12 @@ keep the result in the input tensor a and return nothing
 */
 void _SortMe(XTensor * a, XTensor * index, int dim);

+/*
+sort the data along a given dimension (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void SortMe(XTensor & a, XTensor & index, int dim);
+
 /* 
 sort the data along a given dimension (return an XTensor structure)
 make a new tensor to keep the result and return it

--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
@@ -238,9 +238,9 @@ void KernelTopK(T * input, int stride, int strideNum, int blockNum, int k, T min
        CudaXHeap<MIN_HEAP, T> heapFinal(k, k, heapData + k * threadIdx.y * blockDim.x);

        /* 
-	merge the result over the workers.
+    merge the result over the workers.
        This can be improved by parallel merging 
-	*/
+    */
        if (blockDim.x > 1) {
            for (int p = 1; p < blockDim.x && p < strideNum; p++) {
                CudaHeapNode<T> * hd = heapData + k * (threadIdx.y * blockDim.x + p);
@@ -770,22 +770,22 @@ void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum,
   /*
   if (idx == 0)
    {
-    	unsigned int* uintOutput = new unsigned int;
-    	int* tmpIndex = new int;
-    	//*******************something worng***************************
-    	cudaMalloc((void **)&uintOutput, sizeof(unsigned int)* k);
-    	cudaMalloc((void **)&tmpIndex, sizeof(unsigned int)*k);
-    	//*************************************************************
-    	collectNumberOld(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
-    	int blockIndex = idy / stride;
-    	int offsetInBlock = idy% stride;
-
-    	for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
-    	{
-    		//for(int i = )
-    		output[i] = deconvert(uintOutput[j]);
-    		index[i] = tmpIndex[j];
-    	}
+        unsigned int* uintOutput = new unsigned int;
+        int* tmpIndex = new int;
+        //*******************something wrong***************************
+        cudaMalloc((void **)&uintOutput, sizeof(unsigned int)* k);
+        cudaMalloc((void **)&tmpIndex, sizeof(unsigned int)*k);
+        //*************************************************************
+        collectNumberOld(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
+        int blockIndex = idy / stride;
+        int offsetInBlock = idy% stride;
+
+        for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
+        {
+            //for(int i = )
+            output[i] = deconvert(uintOutput[j]);
+            index[i] = tmpIndex[j];
+        }
    }
    __syncthreads();
    */

--- a/source/tensor/core/utilities/SetAscendingOrder.cu
+++ b/source/tensor/core/utilities/SetAscendingOrder.cu
@@ -67,8 +67,8 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
 {
    CheckNTErrors((a->dataType == X_INT), "TODO!");

-	int dimRDI = a->order - dim - 1;
-	int stride = 1;
+    int dimRDI = a->order - dim - 1;
+    int stride = 1;
    int strideNum = a->dimSizeRDI[dimRDI];
    for(int i = 0; i < dimRDI; i++)
        stride *= a->dimSizeRDI[i];

--- a/source/tensor/core/utilities/XMatrixSegment.cpp
+++ b/source/tensor/core/utilities/XMatrixSegment.cpp
@@ -56,7 +56,7 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
    va_list ap;
    va_start(ap, argNum);
    for (int i = 0; i < argNum; i++) {
-		XTensor* p = va_arg(ap, XTensor*);
+        XTensor* p = va_arg(ap, XTensor*);
        jobArgList->Add(p);
    }
    va_end(ap);
@@ -77,19 +77,19 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
    2. other arguments
    */
    for (int i = 0; i < jobNum; i++) {
-		IntList* indexArgs = new IntList(4);
+        IntList* indexArgs = new IntList(4);
        TensorList * blockArgs = new TensorList(argNum);
        int * blockIndex = indexList + i * 4;

-		indexArgs->Add(blockIndex[0]);
-		indexArgs->Add(blockIndex[1]);
-		indexArgs->Add(blockIndex[2]);
-		indexArgs->Add(blockIndex[3]);
+        indexArgs->Add(blockIndex[0]);
+        indexArgs->Add(blockIndex[1]);
+        indexArgs->Add(blockIndex[2]);
+        indexArgs->Add(blockIndex[3]);

        for (int j = 0; j < argNum; j++)
            blockArgs->Add(jobArgList->GetItem(j));

-		args->Add((XTensor*)indexArgs);
+        args->Add((XTensor*)indexArgs);
        args->Add((XTensor*)blockArgs);

        jobs->Add((XTensor*)job);

--- a/source/tensor/test/TAbsolute.cpp
+++ b/source/tensor/test/TAbsolute.cpp
@@ -30,84 +30,84 @@ Set every entry to its absolute value.
 */
 bool TestAbsolute1()
 {
-	/* a tensor of size (3, 2) */
-	int order = 2;
-	int * dimSize = new int[order];
-	dimSize[0] = 3;
-	dimSize[1] = 2;
-
-	int unitNum = 1;
-	for (int i = 0; i < order; i++)
-		unitNum *= dimSize[i];
-
-	DTYPE aData[3][2] = { {1.0F, -2.0F}, 
-	                      {0.5F, -4.0F},
-	                      {0.0F, 6.0F} };
-	DTYPE answer[3][2] = { {1.0F, 2.0F},
-	                       {0.5F, 4.0F},
-	                       {0.0F, 6.0F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * a = NewTensor(order, dimSize);
-	XTensor * b = NewTensor(order, dimSize);
-	XTensor * aMe = NewTensor(order, dimSize);
+    /* a tensor of size (3, 2) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 3;
+    dimSize[1] = 2;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE aData[3][2] = { {1.0F, -2.0F}, 
+                          {0.5F, -4.0F},
+                          {0.0F, 6.0F} };
+    DTYPE answer[3][2] = { {1.0F, 2.0F},
+                           {0.5F, 4.0F},
+                           {0.0F, 6.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+    XTensor * aMe = NewTensor(order, dimSize);
    XTensor bUser;

-	/* initialize variables */
-	a->SetData(aData, unitNum);
+    /* initialize variables */
+    a->SetData(aData, unitNum);
    aMe->SetData(aData, unitNum);

-	/* call Absolute function */
+    /* call Absolute function */
    _Absolute(a, b);
-	_AbsoluteMe(aMe);
+    _AbsoluteMe(aMe);
    bUser = Absolute(*a);

-	/* check results */
-	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
+    /* check results */
+    cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
    
 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor bUserGPU;

-	/* Initialize variables */
-	aGPU->SetData(aData, unitNum);
+    /* Initialize variables */
+    aGPU->SetData(aData, unitNum);
    aMeGPU->SetData(aData, unitNum);

-	/* call Absolute function */
+    /* call Absolute function */
    _Absolute(aGPU, bGPU);
-	_AbsoluteMe(aMeGPU);
+    _AbsoluteMe(aMeGPU);
    bUserGPU = Absolute(*aGPU);

-	/* check results */
-	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);

-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
    delete aGPU;
    delete bGPU;
    delete aMeGPU;
-	delete[] dimSize;
+    delete[] dimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete[] dimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete[] dimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -119,33 +119,33 @@ TODO!!
 /* test for Absolute Function */
 bool TestAbsolute()
 {
-	XPRINT(0, stdout, "[TEST Absolute] set every entry to its absolute value \n");
-	bool returnFlag = true, caseFlag = true;
+    XPRINT(0, stdout, "[TEST Absolute] set every entry to its absolute value \n");
+    bool returnFlag = true, caseFlag = true;

-	/* case 1 test */
-	caseFlag = TestAbsolute1();
+    /* case 1 test */
+    caseFlag = TestAbsolute1();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TClip.cpp
+++ b/source/tensor/test/TClip.cpp
@@ -31,88 +31,88 @@ Set every entry to its clip value.
 */
 bool TestClip1()
 {
-	/* a tensor of size (3, 2) */
-	int aOrder = 2;
-	int * aDimSize = new int[aOrder];
-	aDimSize[0] = 3;
-	aDimSize[1] = 2;
-
-	int aUnitNum = 1;
-	for (int i = 0; i < aOrder; i++)
-		aUnitNum *= aDimSize[i];
-
-	DTYPE aData[3][2] = { {1.0F, -2.0F},
-						  {0.0F, 4.0F},
-						  {5.0F, -6.0F} };
-	DTYPE answer[3][2] = { {1.0F, -1.0F},
-						   {0.0F, 1.0F},
-					   	   {1.0F, -1.0F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * a = NewTensor(aOrder, aDimSize);
-	XTensor * b = NewTensor(aOrder, aDimSize);
-	XTensor * aMe = NewTensor(aOrder, aDimSize);
-	XTensor bUser;
-
-	/* initialize variables */
-	a->SetData(aData, aUnitNum);
-	aMe->SetData(aData, aUnitNum);
-
-	/* call Clip function */
-	_Clip(a, b, -1.0, 1.0);
-	_ClipMe(aMe, -1.0, 1.0);
-	bUser = Clip(*a, -1.0, 1.0);
-
-	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    DTYPE aData[3][2] = { {1.0F, -2.0F},
+                          {0.0F, 4.0F},
+                          {5.0F, -6.0F} };
+    DTYPE answer[3][2] = { {1.0F, -1.0F},
+                           {0.0F, 1.0F},
+                              {1.0F, -1.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(aOrder, aDimSize);
+    XTensor * aMe = NewTensor(aOrder, aDimSize);
+    XTensor bUser;
+
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    aMe->SetData(aData, aUnitNum);
+
+    /* call Clip function */
+    _Clip(a, b, -1.0, 1.0);
+    _ClipMe(aMe, -1.0, 1.0);
+    bUser = Clip(*a, -1.0, 1.0);
+
+    /* check results */
+    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
              bUser.CheckData(answer, aUnitNum, 1e-4F);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor bUserGPU;
-
-	/* Initialize variables */
-	aGPU->SetData(aData, aUnitNum);
-	aMeGPU->SetData(aData, aUnitNum);
-
-	/* call Clip function */
-	_Clip(aGPU, bGPU, -1.0, 1.0);
-	_ClipMe(aMeGPU, -1.0, 1.0);
-	bUserGPU = Clip(*aGPU, -1.0, 1.0);
-
-	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    aMeGPU->SetData(aData, aUnitNum);
+
+    /* call Clip function */
+    _Clip(aGPU, bGPU, -1.0, 1.0);
+    _ClipMe(aMeGPU, -1.0, 1.0);
+    bUserGPU = Clip(*aGPU, -1.0, 1.0);
+
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete aGPU;
-	delete bGPU;
-	delete aMeGPU;
-	delete[] aDimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+    delete[] aDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete[] aDimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete[] aDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -124,33 +124,33 @@ TODO!!
 /* test for Clip Function */
 bool TestClip()
 {
-	XPRINT(0, stdout, "[TEST Clip] set every entry to its clip value \n");
-	bool returnFlag = true, caseFlag = true;
+    XPRINT(0, stdout, "[TEST Clip] set every entry to its clip value \n");
+    bool returnFlag = true, caseFlag = true;

-	/* case 1 test */
-	caseFlag = TestClip1();
+    /* case 1 test */
+    caseFlag = TestClip1();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TCompare.cpp
+++ b/source/tensor/test/TCompare.cpp
@@ -31,88 +31,88 @@ Comapre whether every entry is equal to the specified value.
 */
 bool TestCompare1()
 {
-	/* a tensor of size (3, 2) */
-	int aOrder = 2;
-	int * aDimSize = new int[aOrder];
-	aDimSize[0] = 3;
-	aDimSize[1] = 2;
-
-	int aUnitNum = 1;
-	for (int i = 0; i < aOrder; i++)
-		aUnitNum *= aDimSize[i];
-
-	DTYPE aData[3][2] = { {1.0F, -2.0F},
-						  {0.0F, 4.0F},
-						  {5.0F, 1.0F} };
-	DTYPE answer[3][2] = { {1.0F, 0.0F},
-						   {0.0F, 0.0F},
-					   	   {0.0F, 1.0F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * a = NewTensor(aOrder, aDimSize);
-	XTensor * b = NewTensor(aOrder, aDimSize);
-	XTensor * aMe = NewTensor(aOrder, aDimSize);
-	XTensor bUser;
-
-	/* initialize variables */
-	a->SetData(aData, aUnitNum);
-	aMe->SetData(aData, aUnitNum);
-
-	/* call Equal function */
-	_Equal(a, b, 1.0);
-	_EqualMe(aMe, 1.0);
-	bUser = Equal(*a, 1.0);
-
-	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    DTYPE aData[3][2] = { {1.0F, -2.0F},
+                          {0.0F, 4.0F},
+                          {5.0F, 1.0F} };
+    DTYPE answer[3][2] = { {1.0F, 0.0F},
+                           {0.0F, 0.0F},
+                              {0.0F, 1.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(aOrder, aDimSize);
+    XTensor * aMe = NewTensor(aOrder, aDimSize);
+    XTensor bUser;
+
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    aMe->SetData(aData, aUnitNum);
+
+    /* call Equal function */
+    _Equal(a, b, 1.0);
+    _EqualMe(aMe, 1.0);
+    bUser = Equal(*a, 1.0);
+
+    /* check results */
+    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
              bUser.CheckData(answer, aUnitNum, 1e-4F);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor bUserGPU;
-
-	/* Initialize variables */
-	aGPU->SetData(aData, aUnitNum);
-	aMeGPU->SetData(aData, aUnitNum);
-
-	/* call Equal function */
-	_Equal(aGPU, bGPU, 1.0);
-	_EqualMe(aMeGPU, 1.0);
-	bUserGPU = Equal(*aGPU, 1.0);
-
-	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    aMeGPU->SetData(aData, aUnitNum);
+
+    /* call Equal function */
+    _Equal(aGPU, bGPU, 1.0);
+    _EqualMe(aMeGPU, 1.0);
+    bUserGPU = Equal(*aGPU, 1.0);
+
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);

-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete aGPU;
-	delete bGPU;
-	delete aMeGPU;
-	delete[] aDimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+    delete[] aDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete[] aDimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete[] aDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -124,33 +124,33 @@ TODO!!
 /* test for Compare Function */
 bool TestCompare()
 {
-	XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
-	bool returnFlag = true, caseFlag = true;
+    XPRINT(0, stdout, "[TEST Compare] compare every entry with specified value \n");
+    bool returnFlag = true, caseFlag = true;

-	/* case 1 test */
-	caseFlag = TestCompare1();
+    /* case 1 test */
+    caseFlag = TestCompare1();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
@@ -29,7 +29,7 @@ In this case, 2 * (2, 1) -> (2, 2), dim=1.
 */
 bool TestConcatenate1()
 {
-	/* create list */
+    /* create list */
    TensorList * sList = new TensorList();

    /* a source tensor of size (2, 1) */
@@ -83,7 +83,7 @@ bool TestConcatenate1()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();

-	/* add tensors to list */
+    /* add tensors to list */
    sList->Add(s1);
    sList->Add(s2);

@@ -99,29 +99,29 @@ bool TestConcatenate1()
    bool gpuTest = true;

    /* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

    /* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();

-	/* clear list */
-	sList->Clear();
+    /* clear list */
+    sList->Clear();

-	/* add tensors to list*/
-	sList->Add(sGPU1);
-	sList->Add(sGPU2);
+    /* add tensors to list*/
+    sList->Add(sGPU1);
+    sList->Add(sGPU2);

-	/* call Concatenate function */
-	_Concatenate(sList, tGPU, 1);
+    /* call Concatenate function */
+    _Concatenate(sList, tGPU, 1);
    tUserGPU = Concatenate(*sList, 1);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);

    /* destroy variables */
    delete sList;
@@ -135,7 +135,7 @@ bool TestConcatenate1()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete sList;
@@ -156,7 +156,7 @@ In this case, 2 * (2, 1) -> (4, 1), dim=0.
 */
 bool TestConcatenate2()
 {
-	/* create list */
+    /* create list */
    TensorList * sList = new TensorList();

    /* a source tensor of size (2, 1) */
@@ -212,7 +212,7 @@ bool TestConcatenate2()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();

-	/* add tensors to list */
+    /* add tensors to list */
    sList->Add(s1);
    sList->Add(s2);

@@ -224,35 +224,35 @@ bool TestConcatenate2()
    cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
    
-	/* clear list */
-	sList->Clear();
+    /* clear list */
+    sList->Clear();

-	/* add tensors to list*/
-	sList->Add(sGPU1);
-	sList->Add(sGPU2);
+    /* add tensors to list*/
+    sList->Add(sGPU1);
+    sList->Add(sGPU2);

-	/* call Concatenate function */
-	_Concatenate(sList, tGPU, 0);
+    /* call Concatenate function */
+    _Concatenate(sList, tGPU, 0);
    tUserGPU = Concatenate(*sList, 0);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);

-	/* destroy variables */
+    /* destroy variables */
    delete sList;
    delete s1;
    delete s2;
@@ -264,7 +264,7 @@ bool TestConcatenate2()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete sList;
@@ -285,7 +285,7 @@ In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenate3()
 {
-	/* create list */
+    /* create list */
    TensorList * sList = new TensorList();

    /* a source tensor of size (2, 1) */
@@ -339,7 +339,7 @@ bool TestConcatenate3()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();

-	/* add tensors to list */
+    /* add tensors to list */
    sList->Add(s1);
    sList->Add(s2);

@@ -351,35 +351,35 @@ bool TestConcatenate3()
    cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
    
-	/* clear list */
-	sList->Clear();
+    /* clear list */
+    sList->Clear();

-	/* add tensors to list*/
-	sList->Add(sGPU1);
-	sList->Add(sGPU2);
+    /* add tensors to list*/
+    sList->Add(sGPU1);
+    sList->Add(sGPU2);

-	/* call Concatenate function */
-	_Concatenate(sList, tGPU, 1);
+    /* call Concatenate function */
+    _Concatenate(sList, tGPU, 1);
    tUserGPU = Concatenate(*sList, 1);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);

-	/* destroy variables */
+    /* destroy variables */
    delete sList;
    delete s1;
    delete s2;
@@ -391,7 +391,7 @@ bool TestConcatenate3()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete sList;
@@ -402,7 +402,7 @@ bool TestConcatenate3()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -471,28 +471,28 @@ bool TestConcatenate4()
    cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();

-	/* call Concatenate function */
-	_Concatenate(sGPU1, sGPU2, tGPU, 1);
+    /* call Concatenate function */
+    _Concatenate(sGPU1, sGPU2, tGPU, 1);
    tUserGPU = Concatenate(*sGPU1, *sGPU2, 1);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);

-	/* destroy variables */
+    /* destroy variables */
    delete s1;
    delete s2;
    delete t;
@@ -503,7 +503,7 @@ bool TestConcatenate4()
    //delete[] sDimSize2;
    //delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete s1;
@@ -513,7 +513,7 @@ bool TestConcatenate4()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }


--- a/source/tensor/test/TConcatenateSolely.cpp
+++ b/source/tensor/test/TConcatenateSolely.cpp
@@ -30,7 +30,7 @@ In this case, 2 * (2, 1) -> (2, 2), dim=1.
 */
 bool TestConcatenateSolely1()
 {
-	/* create list */
+    /* create list */
    TensorList * sList = new TensorList();

    /* a source tensor of size (2, 1) */
@@ -83,44 +83,44 @@ bool TestConcatenateSolely1()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();

-	/* add tensors to list */
+    /* add tensors to list */
    sList->Add(s1);
    sList->Add(s2);

-	/* call ConcatenateSolely function */
+    /* call ConcatenateSolely function */
    _ConcatenateSolely(sList, t, 1);

    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-
-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
    
-	/* clear list */
-	sList->Clear();
+    /* clear list */
+    sList->Clear();

-	/* add tensors to list*/
-	sList->Add(sGPU1);
-	sList->Add(sGPU2);
+    /* add tensors to list*/
+    sList->Add(sGPU1);
+    sList->Add(sGPU2);

-	/* call ConcatenateSolely function */
-	_ConcatenateSolely(sList, tGPU, 1);
+    /* call ConcatenateSolely function */
+    _ConcatenateSolely(sList, tGPU, 1);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);

-	/* destroy variables */
+    /* destroy variables */
    delete sList;
    delete s1;
    delete s2;
@@ -132,7 +132,7 @@ bool TestConcatenateSolely1()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete sList;
@@ -143,7 +143,7 @@ bool TestConcatenateSolely1()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
    }

@@ -153,7 +153,7 @@ In this case, 2 * (2, 1) -> (4, 1), dim=0.
 */
 bool TestConcatenateSolely2()
 {
-	/* create list */
+    /* create list */
    TensorList * sList = new TensorList();

    /* a source tensor of size (2, 1) */
@@ -208,7 +208,7 @@ bool TestConcatenateSolely2()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();

-	/* add tensors to list */
+    /* add tensors to list */
    sList->Add(s1);
    sList->Add(s2);

@@ -219,33 +219,33 @@ bool TestConcatenateSolely2()
    cpuTest = t->CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-
-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
    
-	/* clear list */
-	sList->Clear();
+    /* clear list */
+    sList->Clear();

-	/* add tensors to list*/
-	sList->Add(sGPU1);
-	sList->Add(sGPU2);
+    /* add tensors to list*/
+    sList->Add(sGPU1);
+    sList->Add(sGPU2);

-	/* call concatenatesolely function */
-	_ConcatenateSolely(sList, tGPU, 0);
+    /* call concatenatesolely function */
+    _ConcatenateSolely(sList, tGPU, 0);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);

-	/* destroy variables */
+    /* destroy variables */
    delete sList;
    delete s1;
    delete s2;
@@ -257,7 +257,7 @@ bool TestConcatenateSolely2()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete sList;
@@ -268,7 +268,7 @@ bool TestConcatenateSolely2()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -278,7 +278,7 @@ In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenateSolely3()
 {
-	/* create list */
+    /* create list */
    TensorList * sList = new TensorList();

    /* a source tensor of size (2, 1) */
@@ -331,44 +331,44 @@ bool TestConcatenateSolely3()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();

-	/* add tensors to list */
+    /* add tensors to list */
    sList->Add(s1);
    sList->Add(s2);

-	/* call ConcatenateSolely function */
+    /* call ConcatenateSolely function */
    _ConcatenateSolely(sList, t, 1);

    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-
-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
    
-	/* clear list */
-	sList->Clear();
+    /* clear list */
+    sList->Clear();

-	/* add tensors to list*/
-	sList->Add(sGPU1);
-	sList->Add(sGPU2);
+    /* add tensors to list*/
+    sList->Add(sGPU1);
+    sList->Add(sGPU2);

-	/* call ConcatenateSolely function */
-	_ConcatenateSolely(sList, tGPU, 1);
+    /* call ConcatenateSolely function */
+    _ConcatenateSolely(sList, tGPU, 1);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);

-	/* destroy variables */
+    /* destroy variables */
    delete sList;
    delete s1;
    delete s2;
@@ -380,7 +380,7 @@ bool TestConcatenateSolely3()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete sList;
@@ -391,7 +391,7 @@ bool TestConcatenateSolely3()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }


--- a/source/tensor/test/TConvertDataType.cpp
+++ b/source/tensor/test/TConvertDataType.cpp
@@ -31,72 +31,72 @@ In this case, the flaot32 data type is converted to int32 data type.
 */
 bool TestConvertDataType1()
 {
-	/* a tensor of size (3, 2) */
-	int aOrder = 2;
-	int * aDimSize = new int[aOrder];
-	aDimSize[0] = 3;
-	aDimSize[1] = 2;
-
-	int aUnitNum = 1;
-	for (int i = 0; i < aOrder; i++)
-		aUnitNum *= aDimSize[i];
-
-	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
-	                      {0.5F, 4.0F},
-	                      {5.0F, 6.0F} };
-	int answer[3][2] = { {1, 2},
-	                     {0, 4},
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+                          {0.5F, 4.0F},
+                          {5.0F, 6.0F} };
+    int answer[3][2] = { {1, 2},
+                         {0, 4},
                         {5, 6} };

-	/* CPU test */
-	bool cpuTest = true;
+    /* CPU test */
+    bool cpuTest = true;

-	/* create tensors */
-	XTensor * a = NewTensor(aOrder, aDimSize);
-	XTensor * b = NewTensor(aOrder, aDimSize, X_INT);
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(aOrder, aDimSize, X_INT);

-	/* initialize variables */
-	a->SetData(aData, aUnitNum);
-	b->SetZeroAll();
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    b->SetZeroAll();

-	/* call ConvertDataType function */
-	_ConvertDataType(a, b);
+    /* call ConvertDataType function */
+    _ConvertDataType(a, b);

-	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum);
+    /* check results */
+    cpuTest = b->CheckData(answer, aUnitNum);
    
 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);

-	/* Initialize variables */
-	aGPU->SetData(aData, aUnitNum);
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);

-	/* call ConvertDataType function */
-	_ConvertDataType(aGPU, bGPU);
+    /* call ConvertDataType function */
+    _ConvertDataType(aGPU, bGPU);

-	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum);

-	/* destroy variables */
-	delete a;
-	delete b;
+    /* destroy variables */
+    delete a;
+    delete b;
    delete aGPU;
    delete bGPU;
-	delete[] aDimSize;
+    delete[] aDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete[] aDimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete[] aDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -106,72 +106,72 @@ In this case, the int32 data type is converted to float32 data type.
 */
 bool TestConvertDataType2()
 {
-	/* a tensor of size (3, 2) */
-	int aOrder = 2;
-	int * aDimSize = new int[aOrder];
-	aDimSize[0] = 3;
-	aDimSize[1] = 2;
-
-	int aUnitNum = 1;
-	for (int i = 0; i < aOrder; i++)
-		aUnitNum *= aDimSize[i];
-
-	int aData[3][2] = { {1, 2}, 
-	                    {0, 4},
-	                    {5, 6} };
-	DTYPE answer[3][2] = { {1.0F, 2.0F}, 
-	                       {0.0F, 4.0F},
-	                       {5.0F, 6.0F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * a = NewTensor(aOrder, aDimSize, X_INT);
-	XTensor * b = NewTensor(aOrder, aDimSize);
-
-	/* initialize variables */
-	a->SetData(aData, aUnitNum);
-	b->SetZeroAll();
-
-	/* call ConvertDataType function */
-	_ConvertDataType(a, b);
-
-	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F);
+    /* a tensor of size (3, 2) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 3;
+    aDimSize[1] = 2;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    int aData[3][2] = { {1, 2}, 
+                        {0, 4},
+                        {5, 6} };
+    DTYPE answer[3][2] = { {1.0F, 2.0F}, 
+                           {0.0F, 4.0F},
+                           {5.0F, 6.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize, X_INT);
+    XTensor * b = NewTensor(aOrder, aDimSize);
+
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    b->SetZeroAll();
+
+    /* call ConvertDataType function */
+    _ConvertDataType(a, b);
+
+    /* check results */
+    cpuTest = b->CheckData(answer, aUnitNum, 1e-4F);
    
 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);

-	/* Initialize variables */
-	aGPU->SetData(aData, aUnitNum);
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);

-	/* call ConvertDataType function */
-	_ConvertDataType(aGPU, bGPU);
+    /* call ConvertDataType function */
+    _ConvertDataType(aGPU, bGPU);

-	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F);

-	/* destroy variables */
-	delete a;
-	delete b;
+    /* destroy variables */
+    delete a;
+    delete b;
    delete aGPU;
    delete bGPU;
-	delete[] aDimSize;
+    delete[] aDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete[] aDimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete[] aDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -298,53 +298,53 @@ TODO!!
 /* test for ConvertDataType Function */
 bool TestConvertDataType()
 {
-	XPRINT(0, stdout, "[TEST ConvertDataType] convert data type \n");
-	bool returnFlag = true, caseFlag = true;
-
-	/* case 1 test */
-	caseFlag = TestConvertDataType1();
-
-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
-
-	/* case 2 test */
-	caseFlag = TestConvertDataType2();
-
-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 2 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 2 passed!\n");
+    XPRINT(0, stdout, "[TEST ConvertDataType] convert data type \n");
+    bool returnFlag = true, caseFlag = true;
+
+    /* case 1 test */
+    caseFlag = TestConvertDataType1();
+
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+
+    /* case 2 test */
+    caseFlag = TestConvertDataType2();
+
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
    
    /* case 3 test */
-	caseFlag = TestConvertDataType3();
+    caseFlag = TestConvertDataType3();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 3 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 3 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TCos.cpp
+++ b/source/tensor/test/TCos.cpp
@@ -30,84 +30,84 @@ Set every entry to its cosine value.
 */
 bool TestCos1()
 {
-	/* a tensor of size (3, 2) */
-	int order = 2;
-	int * dimSize = new int[order];
-	dimSize[0] = 3;
-	dimSize[1] = 2;
-
-	int unitNum = 1;
-	for (int i = 0; i < order; i++)
-		unitNum *= dimSize[i];
-
-	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
-	                      {-1.0F, -2.0F},
-	                      {0.0F, 0.5F} };
-	DTYPE answer[3][2] = { {0.5403F, -0.4161F},
-	                       {0.5403F, -0.4161F},
-	                       {1.0F, 0.8776F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * a = NewTensor(order, dimSize);
+    /* a tensor of size (3, 2) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 3;
+    dimSize[1] = 2;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+                          {-1.0F, -2.0F},
+                          {0.0F, 0.5F} };
+    DTYPE answer[3][2] = { {0.5403F, -0.4161F},
+                           {0.5403F, -0.4161F},
+                           {1.0F, 0.8776F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(order, dimSize);
    XTensor * b = NewTensor(order, dimSize);
-	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor * aMe = NewTensor(order, dimSize);
    XTensor bUser;

-	/* initialize variables */
-	a->SetData(aData, unitNum);
-	aMe->SetData(aData, unitNum);
+    /* initialize variables */
+    a->SetData(aData, unitNum);
+    aMe->SetData(aData, unitNum);

-	/* call Cos function */
-	_Cos(a, b);
-	_CosMe(aMe);
+    /* call Cos function */
+    _Cos(a, b);
+    _CosMe(aMe);
    bUser = Cos(*a);

-	/* check results */
-	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
+    /* check results */
+    cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
    
 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor bUserGPU;

-	/* Initialize variables */
-	aGPU->SetData(aData, unitNum);
-	aMeGPU->SetData(aData, unitNum);
+    /* Initialize variables */
+    aGPU->SetData(aData, unitNum);
+    aMeGPU->SetData(aData, unitNum);

-	/* call Cos function */
+    /* call Cos function */
    _Cos(aGPU, bGPU);
-	_CosMe(aMeGPU);
+    _CosMe(aMeGPU);
    bUserGPU = Cos(*aGPU);

-	/* check results */
-	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);

-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
    delete aGPU;
    delete bGPU;
    delete aMeGPU;
-	delete[] dimSize;
+    delete[] dimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete[] dimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete[] dimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -119,33 +119,33 @@ TODO!!
 /* test for Cos Function */
 bool TestCos()
 {
-	XPRINT(0, stdout, "[TEST Cos] set every entry to its cosine value \n");
-	bool returnFlag = true, caseFlag = true;
+    XPRINT(0, stdout, "[TEST Cos] set every entry to its cosine value \n");
+    bool returnFlag = true, caseFlag = true;

-	/* case 1 test */
-	caseFlag = TestCos1();
+    /* case 1 test */
+    caseFlag = TestCos1();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TDiv.cpp
+++ b/source/tensor/test/TDiv.cpp
@@ -30,97 +30,97 @@ In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=0, alpha=0.
 */
 bool TestDiv1()
 {
-	/* a source tensor of size (2, 2) */
-	int sOrder1 = 2;
-	int * sDimSize1 = new int[sOrder1];
-	sDimSize1[0] = 2;
-	sDimSize1[1] = 2;
-
-	int sUnitNum1 = 1;
-	for (int i = 0; i < sOrder1; i++)
-		sUnitNum1 *= sDimSize1[i];
-
-	/* a source tensor of size (2, 2) */
-	int sOrder2 = 2;
-	int * sDimSize2 = new int[sOrder2];
-	sDimSize2[0] = 2;
-	sDimSize2[1] = 2;
-
-	int sUnitNum2 = 1;
-	for (int i = 0; i < sOrder2; i++)
-		sUnitNum2 *= sDimSize2[i];
-
-	/* a target tensor of size (2, 2) */
-	int tOrder = 2;
-	int * tDimSize = new int[tOrder];
-	tDimSize[0] = 2;
-	tDimSize[1] = 2;
-
-	int tUnitNum = 1;
-	for (int i = 0; i < tOrder; i++)
-		tUnitNum *= tDimSize[i];
-
-	DTYPE sData1[2][2] = { {0.0F, 1.0F},
-	                       {2.0F, 3.0F} };
-	DTYPE sData2[2][2] = { {1.0F, 1.0F},
-	                       {4.0F, 9.0F} };
-	DTYPE answer[2][2] = { {0.0F, 1.0F},
-	                       {0.5F, 0.3333F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * s1 = NewTensor(sOrder1, sDimSize1);
-	XTensor * s2 = NewTensor(sOrder2, sDimSize2);
-	XTensor * t = NewTensor(tOrder, tDimSize);
+    /* a source tensor of size (2, 2) */
+    int sOrder1 = 2;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 2;
+    sDimSize1[1] = 2;
+
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+
+    /* a source tensor of size (2, 2) */
+    int sOrder2 = 2;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 2;
+    sDimSize2[1] = 2;
+
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+
+    /* a target tensor of size (2, 2) */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 2;
+
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+
+    DTYPE sData1[2][2] = { {0.0F, 1.0F},
+                           {2.0F, 3.0F} };
+    DTYPE sData2[2][2] = { {1.0F, 1.0F},
+                           {4.0F, 9.0F} };
+    DTYPE answer[2][2] = { {0.0F, 1.0F},
+                           {0.5F, 0.3333F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
    XTensor * tMe = NewTensor(tOrder, tDimSize);
    XTensor tUser;

-	/* initialize variables */
-	s1->SetData(sData1, sUnitNum1);
-	tMe->SetData(sData1, sUnitNum1);
-	s2->SetData(sData2, sUnitNum2);
-	t->SetZeroAll();
+    /* initialize variables */
+    s1->SetData(sData1, sUnitNum1);
+    tMe->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();

-	/* call Div function */
-	_Div(s1, s2, t, 0, 0);
-	_DivMe(tMe, s2, 0, 0);
+    /* call Div function */
+    _Div(s1, s2, t, 0, 0);
+    _DivMe(tMe, s2, 0, 0);
    tUser = Div(*s1, *s2, 0);

-	/* check results */
-	cpuTest = t->CheckData(answer, tUnitNum, 1e-4F) && 
+    /* check results */
+    cpuTest = t->CheckData(answer, tUnitNum, 1e-4F) && 
              tMe->CheckData(answer, tUnitNum, 1e-4F) && 
              tUser.CheckData(answer, tUnitNum, 1e-4F);

 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
    XTensor tUserGPU;

-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	tMeGPU->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    tMeGPU->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();

-	/* call Div function */
-	_Div(sGPU1, sGPU2, tGPU, 0, 0);
-	_DivMe(tMeGPU, sGPU2, 0, 0);
+    /* call Div function */
+    _Div(sGPU1, sGPU2, tGPU, 0, 0);
+    _DivMe(tMeGPU, sGPU2, 0, 0);
    tUserGPU = Div(*sGPU1, *sGPU2, 0);

-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) && 
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) && 
              tMeGPU->CheckData(answer, tUnitNum, 1e-4F) && 
              tUserGPU.CheckData(answer, tUnitNum, 1e-4F);

-	/* destroy variables */
+    /* destroy variables */
    delete s1;
    delete s2;
    delete t;
@@ -133,7 +133,7 @@ bool TestDiv1()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete s1;
@@ -144,7 +144,7 @@ bool TestDiv1()
    delete[] sDimSize2;
    delete[] tDimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -156,33 +156,33 @@ TODO!!
 /* test for Div Function */
 bool TestDiv()
 {
-	XPRINT(0, stdout, "[TEST Div] element-wise division of two tensors \n");
-	bool returnFlag = true, caseFlag = true;
+    XPRINT(0, stdout, "[TEST Div] element-wise division of two tensors \n");
+    bool returnFlag = true, caseFlag = true;

-	/* case 1 test */
-	caseFlag = TestDiv1();
+    /* case 1 test */
+    caseFlag = TestDiv1();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TDivDim.cpp
+++ b/source/tensor/test/TDivDim.cpp
@@ -127,8 +127,8 @@ bool TestDivDim1()
 #else
    /* destroy variables */
    delete a;
-	delete b;
-	delete c;
+    delete b;
+    delete c;
    delete cMe;
    delete[] aDimSize;
    delete[] bDimSize;
@@ -241,8 +241,8 @@ bool TestDivDim2()
 #else
    /* destroy variables */
    delete a;
-	delete b;
-	delete c;
+    delete b;
+    delete c;
    delete cMe;
    delete[] aDimSize;
    delete[] bDimSize;

--- a/source/tensor/test/TExp.cpp
+++ b/source/tensor/test/TExp.cpp
@@ -30,88 +30,88 @@ Set every entry to its exponent value.
 */
 bool TestExp1()
 {
-	/* a tensor of size (3, 2) */
-	int order = 2;
-	int * dimSize = new int[order];
-	dimSize[0] = 3;
-	dimSize[1] = 2;
-
-	int unitNum = 1;
-	for (int i = 0; i < order; i++)
-		unitNum *= dimSize[i];
-
-	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
-	                      {-1.0F, -2.0F},
-	                      {0.0F, 0.5F} };
-	DTYPE answer[3][2] = { {2.7183F, 7.3891F},
-	                       {0.3679F, 0.1353F},
-	                       {1.0F, 1.6487F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * a = NewTensor(order, dimSize);
+    /* a tensor of size (3, 2) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 3;
+    dimSize[1] = 2;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+                          {-1.0F, -2.0F},
+                          {0.0F, 0.5F} };
+    DTYPE answer[3][2] = { {2.7183F, 7.3891F},
+                           {0.3679F, 0.1353F},
+                           {1.0F, 1.6487F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(order, dimSize);
    XTensor * b = NewTensor(order, dimSize);
-	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor * aMe = NewTensor(order, dimSize);
    XTensor bUser;

-	/* initialize variables */
-	a->SetData(aData, unitNum);
-	aMe->SetData(aData, unitNum);
+    /* initialize variables */
+    a->SetData(aData, unitNum);
+    aMe->SetData(aData, unitNum);

-	/* call Exp function */
-	_Exp(a, b);
-	_ExpMe(aMe);
+    /* call Exp function */
+    _Exp(a, b);
+    _ExpMe(aMe);
    bUser = Exp(*a);

-	/* check results */
-	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && 
+    /* check results */
+    cpuTest = b->CheckData(answer, unitNum, 1e-4F) && 
              aMe->CheckData(answer, unitNum, 1e-4F) && 
              bUser.CheckData(answer, unitNum, 1e-4F);
    
 #ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
+    /* GPU test */
+    bool gpuTest = true;

-	/* create tensor */
-	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    /* create tensor */
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor bUserGPU;

-	/* Initialize variables */
-	aGPU->SetData(aData, unitNum);
-	aMeGPU->SetData(aData, unitNum);
+    /* Initialize variables */
+    aGPU->SetData(aData, unitNum);
+    aMeGPU->SetData(aData, unitNum);

-	/* call Exp function */
+    /* call Exp function */
    _Exp(aGPU, bGPU);
-	_ExpMe(aMeGPU);
+    _ExpMe(aMeGPU);
    bUserGPU = Exp(*aGPU);

-	/* check results */
-	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && 
+    /* check results */
+    gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && 
              aMeGPU->CheckData(answer, unitNum, 1e-4F) && \
              bUserGPU.CheckData(answer, unitNum, 1e-4F);

-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
    delete aGPU;
    delete bGPU;
    delete aMeGPU;
-	delete[] dimSize;
+    delete[] dimSize;

-	return cpuTest && gpuTest;
+    return cpuTest && gpuTest;
 #else
-	/* destroy variables */
-	delete a;
-	delete b;
-	delete aMe;
-	delete[] dimSize;
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete aMe;
+    delete[] dimSize;

-	return cpuTest;
+    return cpuTest;
 #endif // USE_CUDA
 }

@@ -123,33 +123,33 @@ TODO!!
 /* test for Exp Function */
 bool TestExp()
 {
-	XPRINT(0, stdout, "[TEST Exp] set every entry to its exponent value \n");
-	bool returnFlag = true, caseFlag = true;
+    XPRINT(0, stdout, "[TEST Exp] set every entry to its exponent value \n");
+    bool returnFlag = true, caseFlag = true;

-	/* case 1 test */
-	caseFlag = TestExp1();
+    /* case 1 test */
+    caseFlag = TestExp1();

-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 1 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 1 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* other cases test */
-	/*
-	TODO!!
-	*/
+    /* other cases test */
+    /*
+    TODO!!
+    */

-	if (returnFlag) {
-		XPRINT(0, stdout, ">> All Passed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> Failed!\n");
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

-	XPRINT(0, stdout, "\n");
+    XPRINT(0, stdout, "\n");

-	return returnFlag;
+    return returnFlag;
 }

 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/tensor/test/TLog.cpp
+++ b/source/tensor/test/TLog.cpp
--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
--- a/source/tensor/test/TMerge.cpp
+++ b/source/tensor/test/TMerge.cpp
--- a/source/tensor/test/TMultiply.cpp
+++ b/source/tensor/test/TMultiply.cpp
--- a/source/tensor/test/TNegate.cpp
+++ b/source/tensor/test/TNegate.cpp
--- a/source/tensor/test/TNormalize.cpp
+++ b/source/tensor/test/TNormalize.cpp
--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/tensor/test/TRound.cpp
+++ b/source/tensor/test/TRound.cpp
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/tensor/test/TSign.cpp
+++ b/source/tensor/test/TSign.cpp
--- a/source/tensor/test/TSin.cpp
+++ b/source/tensor/test/TSin.cpp
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
--- a/source/tensor/test/TSub.cpp
+++ b/source/tensor/test/TSub.cpp
--- a/source/tensor/test/TSubDim.cpp
+++ b/source/tensor/test/TSubDim.cpp
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
--- a/source/tensor/test/TTan.cpp
+++ b/source/tensor/test/TTan.cpp
--- a/source/tensor/test/TTranspose.cpp
+++ b/source/tensor/test/TTranspose.cpp
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp