refactor parameter from pointer to reference

771643c6 · huchi · 04f129fc · 771643c6 · 771643c6 · 771643c6
Commit 771643c6 authored Jul 19, 2019 by huchi
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
--- a/source/sample/transformer/T2TBatchLoader.cpp
+++ b/source/sample/transformer/T2TBatchLoader.cpp
--- a/source/sample/transformer/T2TBatchLoader.h
+++ b/source/sample/transformer/T2TBatchLoader.h
--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
@@ -303,7 +303,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    /* Then, we do something similar to "preID". For the top-k predictions, we need 
       to know their indices in the vocabulary. We compute the offset of each prediction
       in the vocabulary by dividing it with vocab-size and computing the remainder. */
-    _ModMe(index, sizeVocab);
+    ModMe(index, sizeVocab);
    score.Reshape(order, dims);

--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
--- a/source/tensor/XList.h
+++ b/source/tensor/XList.h
--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
--- a/source/tensor/XQueue-李垠桥的MacBook Pro.cpp
+++ b/source/tensor/XQueue-李垠桥的MacBook Pro.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
- * 
- * This is an implementation of queue. Actually we intend to use it to maintain
- * a priority job list
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-04-05
- *
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include "XQueue.h"
-#include "XDevice.h"
-#include "XList.h"
-#include "XUtility.h"
-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
-/**************************************
-job item used in queues
-*/
-/* constructor */
-JobQueueNode::JobQueueNode()
-{
-    job  = NULL;
-    args = new TensorList(1);
-}
-/* de-constructor */
-JobQueueNode::~JobQueueNode()
-{
-    delete args;
-}
-/**************************************
-This class provides standard utilities of Queue.
-*/
-/* constuctor */
-XQueue::XQueue(int mySize)
-{
-    queue = new void*[mySize];
-    memset(queue, 0, sizeof(void*) * mySize);
-    size = mySize;
-    itemCount = 0;
-    head = 0;
-    tail = 0;
-    isJobQueue = false;
-    jobDequeuerArgs = new TensorList(1);
-    jobDequeuerBreak = false;
-    runningJobCount = 0;
-    jobStream = NULL;
-    jobStream1 = NULL;
-    jobStream2 = NULL;
-    MUTEX_INIT(enqueueMutex);
-    MUTEX_INIT(dequeueMutex);
-    COND_INIT(queueCond);
-    MUTEX_INIT(jobQueueMutex);
-}
-/* deconstructor */
-XQueue::~XQueue()
-{
-    delete[] queue;
-    delete jobDequeuerArgs;
-    delete jobStream;
-    delete jobStream1;
-    delete jobStream2;
-    //if(isJobQueue)
-    //    StopJobConsumer();
-    MUTEX_DELE(enqueueMutex);
-    MUTEX_DELE(dequeueMutex);
-    COND_DELE(queueCond);
-    MUTEX_DELE(jobQueueMutex);
-}
-/* 
-put an item in the tail of the queue 
->> item - the item we intend to add into the queue
-*/
-void XQueue::Enqueue(void * item)
-{
-    MUTEX_LOCK(enqueueMutex);
-    MUTEX_LOCK(dequeueMutex);
-    CheckNTErrors((itemCount < size), "Put too many items into the queue!");
-    queue[tail] = item;
-    tail = (tail + 1) % size;
-    itemCount++;
-    COND_SIGNAL(queueCond);
-    MUTEX_UNLOCK(dequeueMutex);
-    MUTEX_UNLOCK(enqueueMutex);
-}
-/* 
-fetch an item from head of the queue 
-<< return - the head item of the queue
-*/
-void * XQueue::Dequeue()
-{
-    MUTEX_LOCK(dequeueMutex);
-    while(itemCount == 0)
-    {
-#ifdef  WIN32
-        MUTEX_UNLOCK(dequeueMutex);
-#endif
-        COND_WAIT(queueCond, dequeueMutex);
-#ifdef  WIN32
-        MUTEX_LOCK(dequeueMutex);
-#endif
-    }
-    void * r = queue[head];
-    head = (head + 1) % size;
-    itemCount--;
-    MUTEX_UNLOCK(dequeueMutex);
-    return r;
-}
-/* return if the queue is empty */
-bool XQueue::IsEmpty()
-{
-    return itemCount == 0;
-}
-/* wait until the queue is empty */
-void XQueue::WaitForEmptyJobQueue()
-{
-    while(runningJobCount > 0){
-        XSleep(10);
-    }
-    if(jobStream != NULL){
-        CheckNTErrors((jobStream->IsFinished()), "None fineished jobs remain");
-        jobStream->Clear();
-    }
-    if(jobStream1 != NULL){
-        CheckNTErrors((jobStream1->IsFinished()), "None fineished jobs remain");
-        jobStream1->Clear();
-    }
-    if(jobStream2 != NULL){
-        CheckNTErrors((jobStream2->IsFinished()), "None fineished jobs remain");
-        jobStream2->Clear();
-    }
-}
-int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-int cpuid = -1;
-/* 
-run job consumer (in another thread) 
->> jobDevID - id of the device for running the jobs
-*/
-void XQueue::RunJobConsumer(int jobDevID)
-{
-    CheckNTErrors((jobDevID < 16), "device id is out of scope!");
-    isJobQueue = true;
-    jobDequeuerArgs->Clear();
-    jobDequeuerArgs->Add(this);
-    jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
-    jobDequeuer.function = (TFunction)DequeueJobs;
-    jobDequeuer.argv = jobDequeuerArgs;
-    jobDequeuer.Start();
-    jobDequeuer.LetItGo();
-}
-/* stop the job consumer */
-void XQueue::StopJobConsumer()
-{
-    jobDequeuerBreak = true;
-    XSleep(10);
-    EnqueueJob(NULL, NULL);
-    jobDequeuer.End();
-    isJobQueue = false;
-}
-/* add a job item to process */
-void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
-{
-    MUTEX_LOCK(jobQueueMutex);
-    runningJobCount++;
-    MUTEX_UNLOCK(jobQueueMutex);
-    JobQueueNode * node = new JobQueueNode();
-    node->job = job;
-    if(jobArgs != NULL)
-        node->args->AddList(jobArgs);
-    Enqueue(node);
-}
-/* job item consumer */
-void XQueue::DequeueJobs(TensorList * args)
-{
-    CheckNTErrors((args->count == 2), "Illegal arguments!");
-    XQueue * q = (XQueue*)args->GetItem(0);
-    int devID = *(int*)args->GetItem(1);
-    int devIDBackup = XDevice::GetGPUDevice();
-    if(devID >= 0)
-        XDevice::SetGPUDevice(devID);
-    while(1){
-        JobQueueNode * node = (JobQueueNode*)q->Dequeue();
-        if(q->GetJobBreak())
-            break;
-        CheckNTErrors((node != NULL), "Illegal job!");
-        /* process a job */
-        ((TFunction)node->job)(node->args);
-        delete node;
-        MUTEX_LOCK(q->jobQueueMutex);
-        q->runningJobCount--;
-        MUTEX_UNLOCK(q->jobQueueMutex);
-    }
-    if(devID >= 0)
-        XDevice::SetGPUDevice(devIDBackup);
-}
-/* get the break flag */
-bool XQueue::GetJobBreak()
-{
-    return jobDequeuerBreak;
-}
-/* get job stream */
-XStream * XQueue::GetJobStream(int n)
-{
-    if(n == 0)
-        return jobStream;
-    else if(n == 1)
-        return jobStream1;
-    else if(n == 2)
-        return jobStream2;
-    else{
-        ShowNTErrors("invalid stream id!");
-    }
-    return NULL;
-}
-/* make job streams */
-void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
-{
-    if(devID != INVALID_DEVICE_ID)
-        jobStream = new XStream(0, devID);
-    if(devID1 != INVALID_DEVICE_ID)
-        jobStream1 = new XStream(0, devID1);
-    if(devID2 != INVALID_DEVICE_ID)
-        jobStream2 = new XStream(0, devID2);
-}
-} /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/tensor/XQueue.cpp
+++ b/source/tensor/XQueue.cpp
--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -190,7 +190,6 @@ XTensor::XTensor(const XTensor &reference)
    isInit = true;
    isTmp  = reference.isTmp;
-	enableGrad = reference.enableGrad;
 }
 /* copy constructor (with right value reference) */
@@ -219,7 +218,6 @@ XTensor::XTensor(const XTensor &&reference)
    isInit = true;
    isTmp  = reference.isTmp;
-	enableGrad = reference.enableGrad;
 }
 /* de-constructor */
@@ -316,6 +314,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 {
    strcpy(name, tensor.name);
    order = tensor.order;
+    enableGrad = tensor.enableGrad;
    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
    dataType = tensor.dataType;
@@ -403,7 +402,6 @@ XTensor& XTensor::operator= (const XTensor& tensor)
        /* create tensor links for the new tensor */
        XLink::Replace(&tensor, this);
    }
-	enableGrad = tensor.enableGrad;
    return *this;
 }
@@ -450,7 +448,6 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
    *tensor.dataP = NULL;
    XLink::Replace(&tensor, this);
-	enableGrad = tensor.enableGrad;
    return *this;
 }

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
@@ -143,6 +143,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
 }
 /*
+element-wise division of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+a(i) = a(i)/b(i) + \alpha * a(i)
+where i is the index of the item
+this is the reference-based entry point: it forwards to _Div with a
+passed as both the first operand and the output tensor
+>> a - tensor a (the dividend, and also where we keep the result)
+>> b - tensor b (the divisor)
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
+{
+    _Div(&a, &b, &a, alpha, leadingDim);
+}
+/* 
 return a dimension if the division is performed as DivDim (in more details in DivDim.h)
 >> a - a tensor
 >> b - another tensor for division

--- a/source/tensor/core/arithmetic/Div.cu
+++ b/source/tensor/core/arithmetic/Div.cu
--- a/source/tensor/core/arithmetic/Div.h
+++ b/source/tensor/core/arithmetic/Div.h
@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i)
 where i is the index of the element 
 */
 void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
+void DivMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
 /* 
 element-wise division of two tensors (return an XTensor structure)

--- a/source/tensor/core/arithmetic/Mask.cpp
+++ b/source/tensor/core/arithmetic/Mask.cpp
@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
 }
 /*
+mask entries of a given tensor (on site):
+a(i) = a(i) if mask(i) is non-zero
+a(i) = alpha if mask(i) = 0
+where i is the index of the element
+the result is kept in the input tensor a and nothing is returned
+(it forwards to _Mask with a passed as both the input and the output)
+>> a - the tensor to be masked (also where we keep the result)
+>> mask - the mask tensor
+>> alpha - the value assigned to the entries whose mask is 0
+*/
+void MaskMe(XTensor& a, const XTensor& mask, DTYPE alpha)
+{
+    _Mask(&a, &mask, &a, alpha);
+}
+/*
 mask entries of a given tensor (return an XTensor structure):
 a(i) = a(i) if mask(i) is non-zero
 a(i) = alpha if mask(i) = 0

--- a/source/tensor/core/arithmetic/Mask.h
+++ b/source/tensor/core/arithmetic/Mask.h
@@ -43,6 +43,7 @@ a(i) = alpha if mask(i) = 0
 where i is the index of the element
 */
 void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
+void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha);
 /* 
 mask entries of a given tensor (return an XTensor structure):

--- a/source/tensor/core/arithmetic/MatrixMul2D.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2D.cpp
--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
@@ -144,6 +144,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
 }
 /*
+element-wise product of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+a(i) = a(i)*b(i) + \alpha * a(i)
+where i is the index of the item
+this is the reference-based entry point: it forwards to _Multiply with a
+passed as both the first operand and the output tensor
+>> a - tensor a (the first operand, and also where we keep the result)
+>> b - tensor b (the second operand)
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
+{
+    _Multiply(&a, &b, &a, alpha, leadingDim);
+}
+/* 
 return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
 >> a - a tensor
 >> b - another tensor for multiplication

--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i)
 where i is the index of the element 
 */
 void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
+void MultiplyMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
 /* 
 element-wise product of two tensors (return an XTensor structure)

--- a/source/tensor/core/arithmetic/MultiplyDim.cpp
+++ b/source/tensor/core/arithmetic/MultiplyDim.cpp
@@ -139,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
 }
 /*
+tensor multiplication (do it on site)
+keep the result in the input tensor a and return nothing
+a = a * b + \alpha * a
+where the size of b is equal to the n-th dimension of a,
+i.e., a is multiplied with b by broadcasting
+(it forwards to _MultiplyDim with a passed as both the input and the output)
+>> a - a tensor (also where we keep the result)
+>> b - another tensor whose size is equal to that of dimension n of a
+>> n - the dimension index
+>> alpha - the scaling factor
+*/
+void MultiplyDimMe(XTensor& a, const XTensor& b, int n, DTYPE alpha)
+{
+    _MultiplyDim(&a, &b, &a, n, alpha);
+}
+/*
 tensor multiplication (return an XTensor structure and make tensor connections)
 make a new tensor to keep the result and return it

--- a/source/tensor/core/arithmetic/MultiplyDim.h
+++ b/source/tensor/core/arithmetic/MultiplyDim.h
@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
 /* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
 void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
+void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);
 /* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */

--- a/source/tensor/core/arithmetic/Negate.cpp
+++ b/source/tensor/core/arithmetic/Negate.cpp
@@ -60,6 +60,16 @@ void _NegateMe(XTensor * a)
 }
 /*
+set every entry to its minus value (do it on site)
+keep the result in the input tensor a and return nothing
+(it forwards to _Negate with a passed as both the input and the output)
+>> a - the tensor we are processing (overwritten with the result)
+*/
+void NegateMe(XTensor& a)
+{
+    _Negate(&a, &a);
+}
+/*
 set every entry to its minus value (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor we are processing

--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
@@ -34,6 +34,7 @@ set every entry to its minus value (do it on site)
 keep the result in the input tensor a and return nothing
 */
 void _NegateMe(XTensor * a);
+void NegateMe(XTensor & a);
 /* 
 set every entry to its minus value (return an XTensor structure)

--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
@@ -66,6 +66,16 @@ void _SignMe(XTensor * a)
 }
 /*
+set every entry to its sign value (do it on site)
+keep the result in the input tensor a and return nothing
+(it forwards to _Sign with a passed as both the input and the output)
+>> a - the tensor we are processing (overwritten with the result)
+*/
+void SignMe(XTensor& a)
+{
+    _Sign(&a, &a);
+}
+/*
 set every entry to its sign value (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor we are processing

--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
 void _SignMe(XTensor * a);
 /* 
+set every entry to its sign value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void SignMe(XTensor & a);
+/* 
 set every entry to its sign value  (return an XTensor structure)
 make a new tensor to keep the result and return it
 */

--- a/source/tensor/core/arithmetic/Sub.cpp
+++ b/source/tensor/core/arithmetic/Sub.cpp
@@ -128,6 +128,19 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
 }
 /*
+tensor subtraction a = a - b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+(it forwards to _Sub with a passed as both the first operand and the output)
+>> a - a tensor (the minuend, and also where we keep the result)
+>> b - another tensor (scaled by beta and subtracted from a)
+>> beta - the scaling factor
+*/
+void SubMe(XTensor& a, const XTensor& b, DTYPE beta)
+{
+    _Sub(&a, &b, &a, beta);
+}
+/* 
 return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
 >> a - a tensor
 >> b - another tensor for subtraction

--- a/source/tensor/core/arithmetic/Sub.h
+++ b/source/tensor/core/arithmetic/Sub.h
@@ -35,6 +35,7 @@ tensor subtraction a = a - b * \beta
 keep the result in the input tensor a and return nothing
 */
 void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+void SubMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
 /*
 tensor subtraction c = a - b * \beta

--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
@@ -133,6 +133,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
 }
 /*
+tensor summation a = a + b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+(it forwards to _Sum with a passed as both the first operand and the output)
+>> a - a tensor (also where we keep the result)
+>> b - another tensor (scaled by beta and added to a)
+>> beta - the scaling factor
+*/
+void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
+{
+    _Sum(&a, &b, &a, beta);
+}
+/* 
 return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
 >> a - a tensor
 >> b - another tensor for sum

--- a/source/tensor/core/arithmetic/Sum.h
+++ b/source/tensor/core/arithmetic/Sum.h
@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta
 keep the result in the input tensor a and return nothing
 */
 void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
 /*
 tensor summation c = a + b * \beta

--- a/source/tensor/core/arithmetic/XTensorBLAS.cpp
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cpp
--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
@@ -165,7 +165,7 @@ SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
 SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)
 _SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
+SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
 SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
 #else

--- a/source/tensor/core/math/Binary.h
+++ b/source/tensor/core/math/Binary.h
@@ -37,8 +37,15 @@ void _Scale(const XTensor * a, XTensor * b, float scale);
 scale up tensor entires (on site)
 b = a * scale
 */
-void _ScaleMe(XTensor & a, int scale);
+void _ScaleMe(XTensor * a, int scale);
-void _ScaleMe(XTensor & a, float scale);
+void _ScaleMe(XTensor * a, float scale);
+/*
+scale up tensor entries (on site)
+a = a * scale
+*/
+void ScaleMe(XTensor & a, int scale);
+void ScaleMe(XTensor & a, float scale);
 /*
 scale up tensor entires
@@ -64,8 +71,15 @@ void _Descale(const XTensor * a, XTensor * b, float scale);
 descale tensor entires (on site)
 b = a / scale
 */
-void _DescaleMe(XTensor & a, int scale);
+void _DescaleMe(XTensor * a, int scale);
-void _DescaleMe(XTensor & a, float scale);
+void _DescaleMe(XTensor * a, float scale);
+/*
+descale tensor entries (on site)
+a = a / scale
+*/
+void DescaleMe(XTensor & a, int scale);
+void DescaleMe(XTensor & a, float scale);
 /*
 descale tensor entires
@@ -91,8 +105,15 @@ void _Shift(const XTensor * a, XTensor * b, float shift);
 shift tensor entires (on site)
 b = a + shift
 */
-void _ShiftMe(XTensor & a, int shift);
+void _ShiftMe(XTensor * a, int shift);
-void _ShiftMe(XTensor & a, float shift);
+void _ShiftMe(XTensor * a, float shift);
+/*
+shift tensor entries (on site)
+a = a + shift
+*/
+void ShiftMe(XTensor & a, int shift);
+void ShiftMe(XTensor & a, float shift);
 /*
 shift tensor entires
@@ -118,7 +139,13 @@ void _Mod(const XTensor * a, XTensor * b, int base);
 mod tensor entires (on site)
 b = a % mod
 */
-void _ModMe(XTensor & a, int base);
+void _ModMe(XTensor * a, int base);
+/*
+mod tensor entries (on site)
+a = a % base
+*/
+void ModMe(XTensor & a, int base);
 /*
 mod tensor entires

--- a/source/tensor/core/math/Clip.cpp
+++ b/source/tensor/core/math/Clip.cpp
@@ -71,6 +71,18 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
 }
 /*
+set every entry to its clip value (do it on site)
+keep the result in the input tensor a and return nothing
+(it forwards to _Clip with a passed as both the input and the output)
+>> a - the tensor we are processing (overwritten with the result)
+>> lower - the lower border
+>> upper - the upper border
+*/
+void ClipMe(XTensor& a, DTYPE lower, DTYPE upper)
+{
+    _Clip(&a, &a, lower, upper);
+}
+/*
 set every entry to its clip value (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor we are processing

--- a/source/tensor/core/math/Clip.cu
+++ b/source/tensor/core/math/Clip.cu
--- a/source/tensor/core/math/Clip.h
+++ b/source/tensor/core/math/Clip.h
@@ -33,6 +33,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
   keep the result in the input tensor a and return nothing */
 void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
+/* set every entry to its clip value (do it on site)
+keep the result in the input tensor a and return nothing */
+void ClipMe(XTensor & a, DTYPE lower, DTYPE upper);
 /* set every entry to its clip value  (return an XTensor structure)
   make a new tensor to keep the result and return it */
 XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);

--- a/source/tensor/core/math/Compare.h
+++ b/source/tensor/core/math/Compare.h
@@ -32,6 +32,9 @@ void _Equal(const XTensor * a, XTensor * b, DTYPE value);
 /* check whether every entry is equal to the given value (do it on site) */
 void _EqualMe(XTensor * a, DTYPE value);
+/* check whether every entry is equal to the given value (do it on site) */
+void EqualMe(XTensor & a, DTYPE value);
 /* check whether every entry is equal to the given value (return an XTensor structure) */
 XTensor Equal(const XTensor & a, DTYPE value);
@@ -41,6 +44,9 @@ void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
 /* check whether every entry is not equal to the given value (do it on site) */
 void _NotEqualMe(XTensor * a, DTYPE value);
+/* check whether every entry is not equal to the given value (do it on site) */
+void NotEqualMe(XTensor & a, DTYPE value);
 /* check whether every entry is not equal to the given value (return an XTensor structure) */
 XTensor NotEqual(const XTensor & a, DTYPE value);

--- a/source/tensor/core/math/Normalize.cpp
+++ b/source/tensor/core/math/Normalize.cpp
@@ -113,6 +113,27 @@ void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor 
 {
    _Normalize(input, input, dim, mean, var, a, b, epsilon);
 }
+/*
+normalize the data with normal distribution (do it on site)
+keep the result in the input tensor and return nothing
+For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
+where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
+(it forwards to _Normalize with input passed as both the source and the output)
+>> input - the input tensor (overwritten with the normalized result)
+>> dim - dimension along which we generate the mean and variance
+>> mean - the mean of the input
+>> var - the variance of the input
+>> a - the scalar
+>> b - the bias
+>> epsilon - a parameter
+*/
+void NormalizeMe(XTensor& input, int dim, const XTensor& mean, const XTensor& var, const XTensor& a, const XTensor& b, DTYPE epsilon)
+{
+    _Normalize(&input, &input, dim, &mean, &var, &a, &b, epsilon);
+}
 /*
 normalized the data with normal distribution (return an XTensor structure)
 make a new tensor to keep the result and return it 

--- a/source/tensor/core/math/Normalize.cu
+++ b/source/tensor/core/math/Normalize.cu
--- a/source/tensor/core/math/Normalize.h
+++ b/source/tensor/core/math/Normalize.h
@@ -42,6 +42,14 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
 /*
+normalized the data with normal distribution (do it on site)
+keep the result in the input tensor and return nothing
+For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
+where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
+*/
+void NormalizeMe(XTensor & input, int dim, const XTensor & mean, const XTensor & var, const XTensor & a, const XTensor & b, DTYPE epsilon);
+/*
 normalized the data with normal distribution (return an XTensor structure)
 make a new tensor to keep the result and return it 
 For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b

--- a/source/tensor/core/math/Power.cpp
+++ b/source/tensor/core/math/Power.cpp
@@ -81,6 +81,17 @@ void _PowerMe(XTensor * a, DTYPE p)
 }
 /*
+get the power(a, p) (do it on site)
+keep the result in the input tensor a and return nothing
+(it forwards to _Power with a passed as both the input and the output)
+>> a - the tensor (overwritten with the result)
+>> p - parameter (the exponent passed on to _Power)
+*/
+void PowerMe(XTensor& a, DTYPE p)
+{
+    _Power(&a, &a, p);
+}
+/*
 get the power(a, p) (return an XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor

--- a/source/tensor/core/math/Power.h
+++ b/source/tensor/core/math/Power.h
@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
 void _PowerMe(XTensor * a, DTYPE p);
 /* 
+get the power(a, p) (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void PowerMe(XTensor & a, DTYPE p);
+/* 
 get the power(x, y) (return an XTensor structure)
 make a new tensor to keep the result and return it
 */

--- a/source/tensor/core/math/ScaleAndShift.cpp
+++ b/source/tensor/core/math/ScaleAndShift.cpp
@@ -92,6 +92,21 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
 }
 /* 
+scale and shift all tensor entries (do it on site)
+keep the result in the input tensor a and return nothing
+a = a * scale + shift
+(it forwards to _ScaleAndShift with a passed as both the input and the output)
+>> a - the input/output tensor
+>> scale - the scaler factor
+>> shift - the shift factor
+*/
+void ScaleAndShiftMe(XTensor& a, DTYPE scale, DTYPE shift)
+{
+    _ScaleAndShift(&a, &a, scale, shift);
+}
+/* 
 scale and shift all tensor entires (return an XTensor structure)
 make a new tensor to keep the result and return it

--- a/source/tensor/core/math/ScaleAndShift.h
+++ b/source/tensor/core/math/ScaleAndShift.h
@@ -45,6 +45,13 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
 /*
 scale and shift all tensor entires
+keep the result in the input tensor a and return nothing
+a = a * scale + shift 
+*/
+void ScaleAndShiftMe(XTensor & a, DTYPE scale, DTYPE shift = 0);
+/*
+scale and shift all tensor entires
 make a new tensor to keep the result and return it
 b = a * scale + shift 
 */

--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
@@ -31,6 +31,9 @@ void _Absolute(const XTensor * a, XTensor * b);
 /* set every entry to its absolute value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _AbsoluteMe(XTensor * a);
+/* set every entry to its absolute value (do it on site)
+keep the result in the input tensor a and return nothing */
+void AbsoluteMe(XTensor & a);
 /* set every entry to its absolute value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Absolute(const XTensor & a);
@@ -42,6 +45,9 @@ void _Ceil(const XTensor * a, XTensor * b);
 /* set every entry to its ceil value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _CeilMe(XTensor * a);
+/* set every entry to its ceil value (do it on site)
+keep the result in the input tensor a and return nothing */
+void CeilMe(XTensor & a);
 /* set every entry to its ceil value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Ceil(const XTensor & a);
@@ -53,6 +59,9 @@ void _Exp(const XTensor * a, XTensor * b);
 /* set every entry to its exponent value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _ExpMe(XTensor * a);
+/* set every entry to its exponent value (do it on site)
+keep the result in the input tensor a and return nothing */
+void ExpMe(XTensor & a);
 /* set every entry to its exponent value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Exp(const XTensor & a);
@@ -64,6 +73,9 @@ void _Floor(const XTensor * a, XTensor * b);
 /* set every entry to its floor value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _FloorMe(XTensor * a);
+/* set every entry to its floor value (do it on site)
+keep the result in the input tensor a and return nothing */
+void FloorMe(XTensor & a);
 /* set every entry to its floor value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Floor(const XTensor & a);
@@ -75,6 +87,9 @@ void _IsNonZero(const XTensor *a, XTensor *b);
 /* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
 keep the result in the input tensor a and return nothing */
 void _IsNonZeroMe(XTensor *a);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
+keep the result in the input tensor a and return nothing */
+void IsNonZeroMe(XTensor &a);
 /* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor IsNonZero(const XTensor &a);
@@ -86,6 +101,9 @@ void _IsZero(const XTensor *a, XTensor *b);
 /* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
 keep the result in the input tensor a and return nothing */
 void _IsZeroMe(XTensor *a);
+/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
+keep the result in the input tensor a and return nothing */
+void IsZeroMe(XTensor &a);
 /* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor IsZero(const XTensor &a);
@@ -97,6 +115,9 @@ void _Log(const XTensor * a, XTensor * b);
 /* set every entry to its logarithm value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _LogMe(XTensor * a);
+/* set every entry to its logarithm value (do it on site)
+keep the result in the input tensor a and return nothing */
+void LogMe(XTensor & a);
 /* set every entry to its logarithm value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Log(const XTensor & a);
@@ -108,6 +129,9 @@ void _Round(const XTensor * a, XTensor * b);
 /* set every entry to its round value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _RoundMe(XTensor * a);
+/* set every entry to its round value (do it on site)
+keep the result in the input tensor a and return nothing */
+void RoundMe(XTensor & a);
 /* set every entry to its round value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Round(const XTensor & a);
@@ -119,6 +143,9 @@ void _Sqrt(const XTensor * a, XTensor * b);
 /* set every entry to its sqrt value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _SqrtMe(XTensor * a);
+/* set every entry to its sqrt value (do it on site)
+keep the result in the input tensor a and return nothing */
+void SqrtMe(XTensor & a);
 /* set every entry to its sqrt value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Sqrt(const XTensor & a);
@@ -130,6 +157,9 @@ void _Square(const XTensor * a, XTensor * b);
 /* set every entry to its square value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _SquareMe(XTensor * a);
+/* set every entry to its square value (do it on site)
+keep the result in the input tensor a and return nothing */
+void SquareMe(XTensor & a);
 /* set every entry to its square value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Square(const XTensor & a);
@@ -142,6 +172,9 @@ void _Sin(const XTensor * a, XTensor * b);
 /* set every entry to its sine value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _SinMe(XTensor * a);
+/* set every entry to its sine value (do it on site)
+keep the result in the input tensor a and return nothing */
+void SinMe(XTensor & a);
 /* set every entry to its sine value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Sin(const XTensor & a);
@@ -153,6 +186,9 @@ void _Cos(const XTensor * a, XTensor * b);
 /* set every entry to its cosine value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _CosMe(XTensor * a);
+/* set every entry to its cosine value (do it on site)
+keep the result in the input tensor a and return nothing */
+void CosMe(XTensor & a);
 /* set every entry to its cosine value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Cos(const XTensor & a);
@@ -164,6 +200,9 @@ void _Tan(const XTensor * a, XTensor * b);
 /* set every entry to its tangent value (do it on site)
 keep the result in the input tensor a and return nothing */
 void _TanMe(XTensor * a);
+/* set every entry to its tangent value (do it on site)
+keep the result in the input tensor a and return nothing */
+void TanMe(XTensor & a);
 /* set every entry to its tangent value (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor Tan(const XTensor & a);

--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
--- a/source/tensor/core/reduce/ReduceMean.cpp
+++ b/source/tensor/core/reduce/ReduceMean.cpp
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
--- a/source/tensor/core/reduce/ReduceSumSquared.cpp
+++ b/source/tensor/core/reduce/ReduceSumSquared.cpp
--- a/source/tensor/core/reduce/ReduceVariance.cpp
+++ b/source/tensor/core/reduce/ReduceVariance.cpp
--- a/source/tensor/core/shape/ConcatenateSolely.cpp
+++ b/source/tensor/core/shape/ConcatenateSolely.cpp
--- a/source/tensor/core/shape/Permute.h
+++ b/source/tensor/core/shape/Permute.h
@@ -42,6 +42,13 @@ a = permuted(a)
 void _PermuteMe(XTensor * a, int * dimPermute);
 /*
+permute the tensor dimensions (do it on site).
+keep the result in the input tensor and return nothing.
+a = permuted(a)
+*/
+void PermuteMe(XTensor  &a, int * dimPermute);
+/* 
 make a tensor with permuted dimensions (return an XTensor structure).
 make a new tensor to keep the result and return it.
 b = permuted(a)

--- a/source/tensor/core/shape/Reshape.cpp
+++ b/source/tensor/core/shape/Reshape.cpp
--- a/source/tensor/core/shape/Squeeze.cpp
+++ b/source/tensor/core/shape/Squeeze.cpp
@@ -89,6 +89,20 @@ void _SqueezeMe(XTensor * source, int leadingDim)
 }
 /*
+squeeze the tensor along the specified dimension (do it on site)
+keep the result in the input tensor (source) and return nothing
+>> source - the input tensor
+>> leadingDim - the dimension that we would squeeze
+                if leadingDim = -1, squeeze all dimensions that are 1
+                else, squeeze the specified dimension
+*/
+void SqueezeMe(XTensor& source, int leadingDim)
+{
+    _Squeeze(&source, &source, leadingDim); /* the same tensor is passed as both source and target, so the result overwrites the input */
+}
+/*
 squeeze the tensor along the specified dimension (return an XTensor structure)
 make a new tensor to keep the result and return it

--- a/source/tensor/core/shape/Squeeze.h
+++ b/source/tensor/core/shape/Squeeze.h
@@ -33,6 +33,10 @@ void _Squeeze(XTensor * source, XTensor * target, int leadingDim = -1);
   keep the result in the input tensor a and return nothing */
 void _SqueezeMe(XTensor * source, int leadingDim = -1);
+/* squeeze the tensor along the specified dimension (do it on site)
+   keep the result in the input tensor (source) and return nothing */
+void SqueezeMe(XTensor & source, int leadingDim = -1);
 /* squeeze the tensor along the specified dimension  (return an XTensor structure)
   make a new tensor to keep the result and return it */
 XTensor Squeeze(XTensor & source, int leadingDim = -1);

--- a/source/tensor/core/sort/Sort.cpp
+++ b/source/tensor/core/sort/Sort.cpp
@@ -98,6 +98,21 @@ void _SortMe(XTensor * a, XTensor * index, int dim)
 }
 /*
+sort the tensor along a given dimension (do it on site)
+keep the result in the input tensor a and return nothing
+>> a - input tensor
+>> index - index of the items in the resulting tensor
+>> dim - the dimension along which the sorting is performed
+*/
+void SortMe(XTensor& a, XTensor& index, int dim)
+{
+    _Sort(&a, &a, &index, dim); /* a is passed twice - presumably as both input and output of _Sort, so the sorted data overwrites a (TODO confirm against the _Sort prototype) */
+}
+/*
 sort the tensor along a given dimension (return an XTensor structure)
 make a new tensor to keep the result and return it

--- a/source/tensor/core/sort/Sort.cu
+++ b/source/tensor/core/sort/Sort.cu
--- a/source/tensor/core/sort/Sort.h
+++ b/source/tensor/core/sort/Sort.h
@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
 void _SortMe(XTensor * a, XTensor * index, int dim);
 /*
+sort the data along a given dimension (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void SortMe(XTensor & a, XTensor & index, int dim);
+/* 
 sort the data along a given dimension (return an XTensor structure)
 make a new tensor to keep the result and return it
 */

--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
--- a/source/tensor/core/utilities/SetAscendingOrder.cu
+++ b/source/tensor/core/utilities/SetAscendingOrder.cu
--- a/source/tensor/core/utilities/XMatrixSegment.cpp
+++ b/source/tensor/core/utilities/XMatrixSegment.cpp
--- a/source/tensor/test/TAbsolute.cpp
+++ b/source/tensor/test/TAbsolute.cpp
--- a/source/tensor/test/TClip.cpp
+++ b/source/tensor/test/TClip.cpp
--- a/source/tensor/test/TCompare.cpp
+++ b/source/tensor/test/TCompare.cpp
--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
--- a/source/tensor/test/TConcatenateSolely.cpp
+++ b/source/tensor/test/TConcatenateSolely.cpp
--- a/source/tensor/test/TConvertDataType.cpp
+++ b/source/tensor/test/TConvertDataType.cpp
--- a/source/tensor/test/TCos.cpp
+++ b/source/tensor/test/TCos.cpp
--- a/source/tensor/test/TDiv.cpp
+++ b/source/tensor/test/TDiv.cpp
--- a/source/tensor/test/TDivDim.cpp
+++ b/source/tensor/test/TDivDim.cpp
--- a/source/tensor/test/TExp.cpp
+++ b/source/tensor/test/TExp.cpp
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/tensor/test/TLog.cpp
+++ b/source/tensor/test/TLog.cpp
--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
--- a/source/tensor/test/TMerge.cpp
+++ b/source/tensor/test/TMerge.cpp
--- a/source/tensor/test/TMultiply.cpp
+++ b/source/tensor/test/TMultiply.cpp
--- a/source/tensor/test/TNegate.cpp
+++ b/source/tensor/test/TNegate.cpp
--- a/source/tensor/test/TNormalize.cpp
+++ b/source/tensor/test/TNormalize.cpp
--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/tensor/test/TRound.cpp
+++ b/source/tensor/test/TRound.cpp
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/tensor/test/TSign.cpp
+++ b/source/tensor/test/TSign.cpp
--- a/source/tensor/test/TSin.cpp
+++ b/source/tensor/test/TSin.cpp
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
--- a/source/tensor/test/TSub.cpp
+++ b/source/tensor/test/TSub.cpp
--- a/source/tensor/test/TSubDim.cpp
+++ b/source/tensor/test/TSubDim.cpp
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
--- a/source/tensor/test/TTan.cpp
+++ b/source/tensor/test/TTan.cpp
--- a/source/tensor/test/TTranspose.cpp
+++ b/source/tensor/test/TTranspose.cpp
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp