implement negate and sign operation by macro (unary and binary)

2cba1bdd · xuchen · 0d96c2a0 · 0d96c2a0 · 0d96c2a0 · 2cba1bdd
Commit 2cba1bdd authored Jul 21, 2019 by xuchen
--- a/source/network/XBackwardData.cpp
+++ b/source/network/XBackwardData.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * backward computation for data operation
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
- */
-
-#include "XNoder.h"
-#include "XBackwardData.h"
-#include "../tensor/XName.h"
-#include "../tensor/XUtility.h"
-#include "../tensor/core/CHeader.h"
-#include "../tensor/core/getandset/SetData.h"
-
-namespace nts{
-
-/* compute dE/dx of a node */
-void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
-{
-    CheckNTErrors(node->grad != NULL, "No gradient found!");
-
-    XLink &income = node->income;
-    int operID = income.typeID;
-
-    if(operID == GETANDSET_CONVERTDATATYPE)
-        GradConvertDataType(node, isEfficent);
-    else if(operID == GETANDSET_INDEXTOONEHOT)
-        GradIndexToOnehot(node, isEfficent);
-    else if(operID == GETANDSET_ONEHOTTOINDEX)
-        GradOnehotToIndex(node, isEfficent);
-    else{
-        ShowNTErrors("TODO!");
-    }
-}
-
-/* indicates whether the node is for a data operation */
-bool XDataGrad::IsDataOP(XTensor * node)
-{
-    XLink &income = node->income;
-    return (income.typeID & DATA_BASE) != 0;
-}
-
-/* 
-gradient computation for convert datatype
-for
-b = converdatatype(a) 
-we have
-dE/da = convertdatatype(b)
->> node - the node (c) for backward computation
->> isEfficient - indicates whether the computation is in
-                 an efficient manner
-*/
-void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
-{
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
-
-    XTensor * input = income.tails[0];
-
-    XNoder::MakeGrad(input);
-    _ConvertDataType(node->grad, input->grad);
-}
-
-/* 
-gradient computation for OnehotToIndex
-for
-b = OnehotToIndex(a) 
-we have
-dE/da = IndexToOnehot(b)
->> node - the node (c) for backward computation
->> isEfficient - indicates whether the computation is in
-                 an efficient manner
-*/
-void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
-{
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
-
-    XTensor * input = income.tails[0];
-
-    XNoder::MakeGrad(input);
-
-}
-
-/* 
-gradient computation for IndexToOnehot
-for
-b = IndexToOnehot(a) 
-we have
-dE/da = IndexToOnehot(b)
->> node - the node (c) for backward computation
->> isEfficient - indicates whether the computation is in
-                 an efficient manner
-*/
-void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
-{
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
-
-    XTensor * input = income.tails[0];
-
-    XNoder::MakeGrad(input);
-
-}
-
-} // namespace nts(NiuTrans.Tensor)
--- a/source/network/XBackwardData.h
+++ b/source/network/XBackwardData.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * backward computation for data operation
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
- */
-
-#include "../tensor/XTensor.h"
-#include "../tensor/function/FHeader.h"
-
-#ifndef __XBACKWARDDATA_H__
-#define __XBACKWARDDATA_H__
-
-namespace nts{
-
-/* this class computes the gradient for tensor data operation given a node */
-class XDataGrad
-{
-public:
-    /* compute dE/dx of a node */
-    static
-    void MakeGrad(XTensor * node, bool isEfficent);
-
-    /* indicates whether the node is for a shaping operation */
-    static
-    bool IsDataOP(XTensor * node);
-
-private:
-    
-    /* gradient computation for ConverDataType: b = converdatatype(a, datatype) */
-    static
-    void GradConvertDataType(XTensor * node, bool isEfficent);
-
-    /* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
-    static
-    void GradIndexToOnehot(XTensor * node, bool isEfficent);
-
-    /* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
-    static
-    void GradOnehotToIndex(XTensor * node, bool isEfficent);
-};
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
@@ -302,12 +302,12 @@ void T2TSearch::Generate(T2TStateBundle * beam)
       row means a previous state. The column number is size-of-beam \times vocab-size. We,
       therefore, divide entries of the top-k index by vocab-size to compute the id of the
       previous state for each hypothesis in the top-k list. */
-    _DescaleMe(preID, sizeVocab);
+    DescaleMe(preID, sizeVocab);
    
    /* Then, we do something similar to "preID". For the top-k predictions, we need 
       to know their indices in the vocabulary. We compute the offset of each prediction
       in the vocabulary by dividing it with vocab-size and computing the remainder. */
-    _ModMe(index, sizeVocab);
+    ModMe(index, sizeVocab);

    score.Reshape(order, dims);


--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -280,6 +280,7 @@ void XTensor::Init()
    isTmp =  false;
    isGrad = false;
    isVar  = false;
+    enableGrad = false;
    visitMark = 0;
    grad = NULL;
 }
@@ -310,6 +311,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 {
    strcpy(name, tensor.name);
    order = tensor.order;
+    enableGrad = tensor.enableGrad;
    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
    dataType = tensor.dataType;
@@ -2447,6 +2449,7 @@ void InitTensor(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+    tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize, 
               reference->dataType, reference->denseRatio, 
               reference->devID, reference->mem);
@@ -2462,6 +2465,7 @@ void InitTensorV2(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+    tensor->enableGrad = reference->enableGrad;
    InitTensorV2(tensor, reference->order, reference->dimSize, 
               reference->dataType, reference->devID);
 }
@@ -2476,6 +2480,7 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;
    
+    tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize,
               reference->dataType, reference->denseRatio,
               -1);

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -151,6 +151,9 @@ public:
    /* indicates whether the tensor keeps the gradient when used as model parameters */
    bool isGrad;

+    /* indicates whether the gradient of the tensor should be computed */
+    bool enableGrad;
+
    /* indicates whether the tensor is used as paramters (or variables) */
    bool isVar;


--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -36,8 +36,6 @@
 #include "arithmetic/MatrixMulBatched.h"
 #include "arithmetic/Multiply.h"
 #include "arithmetic/MultiplyDim.h"
-#include "arithmetic/Negate.h"
-#include "arithmetic/Sign.h"
 #include "arithmetic/Sub.h"
 #include "arithmetic/SubDim.h"
 #include "arithmetic/Sum.h"

--- a/source/tensor/core/arithmetic/Negate.cpp
+++ b/source/tensor/core/arithmetic/Negate.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XTensor.h"
-#include "../../XName.h"
-#include "Negate.h"
-#include "Negate.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-set every entry to its minus value
->> a - input tensor we are processing
->> b - output tensor we are processing
-*/
-void _Negate(const XTensor * a, XTensor * b)
-{
-#ifdef USE_CUDA
-    /* run it on GPUs */
-    if (a->devID >= 0) {
-        _CudaNegate(a, b);
-    return;
-    }
-#endif
-
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-    DTYPE * d = (DTYPE*)a->data;
-    DTYPE * db = (DTYPE*)b->data;
-    for (int i = 0; i < a->unitNum; i++)
-        db[i] = -d[i];
-}
-
-/*
-set every entry to its minus value (do it on site)
-keep the result in the input tensor a and return nothing
->> a - the tensor we are processing
-*/
-void _NegateMe(XTensor * a)
-{
-    _Negate(a, a);
-}
-
-/*
-set every entry to its minus value (return an XTensor structure)
-make a new tensor to keep the result and return it
->> a - input tensor we are processing
-<< return - the minus value of input tensor
-*/
-XTensor Negate(const XTensor & a)
-{
-    XTensor b(&a);
-    b.SetTMPFlag();
-    
-    /* call _Negate function */
-    _Negate(&a, &b);
-    
-    /* tensor connections */
-    XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
-    
-    return b;
-}
-
-/*
-set every entry to its minus value
->> a - input tensor we are processing
->> b - output tensor we are processing
->> requireLink - if add operation to network
-*/
-void Negate(const XTensor & a, XTensor & b, bool requireLink)
-{
-    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
-        InitTensor(&b, &a);
-    }
-
-    /* call _Negate function */
-    _Negate(&a, &b);
-
-    if (requireLink) {
-        /* tensor connections */
-        XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
-    }
-}
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Negate.cu
+++ b/source/tensor/core/arithmetic/Negate.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XDevice.h"
-#include "../../XTensor.h"
-#include "Negate.h"
-#include "Negate.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-/*
-set each entry to its negtive value (CUDA Kernel)
->> a - pointer to the input data array
->> b - pointer to the output data array
->> size - size of the data array
-*/
-__global__
-void KernelNegate(DTYPE * a, DTYPE * b, int size)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        b[i] = -a[i];
-}
-
-/*
-set each entry to its negtive value (CUDA Kernel)
-This is for float16 computation
->> a - pointer to the input data array
->> b - pointer to the output data array
->> size - size of the data array
-*/
-__global__
-void KernelNegate(__half * a, __half * b, int size)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-        if (i < size)
-            b[i] = __hsub(__float2half(0), a[i]);
-#else
-        if (i < size)
-            b[i] = __float2half(-__half2float(a[i]));
-#endif
-}
-
-/*
-set each entry to its negtive value
->> a - input tensor
->> b - output tensor
-*/
-void _CudaNegate(const XTensor * a, XTensor * b)
-{
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->isSparse == false), "TODO!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    if (a->dataType == DEFAULT_DTYPE) {
-        KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
-    }
-    else if (a->dataType == X_FLOAT16) {
-        KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
-    }
-    else {
-        ShowNTErrors("TODO!");
-    }
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-
-#endif // USE_CUDA
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Negate.cuh
+++ b/source/tensor/core/arithmetic/Negate.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __NEGATE_CUH__
-#define __NEGATE_CUH__
-
-#include "Negate.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* set each entry to its negtive value (CUDA Kernel) */
-__global__
-void KernelNegate(DTYPE * a, DTYPE * b, int size);
-
-/* set each entry to its negtive value (CUDA Kernel) with float16 data type*/
-__global__
-void KernelNegate(__half * a, __half * b, int size);
-
-/* set each entry to its negtive value */
-void _CudaNegate(const XTensor * a, XTensor * b);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __NEGATE_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __NEGATE_H__
-#define __NEGATE_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* set every entry to its minus value */
-void _Negate(const XTensor * a, XTensor * b);
-
-/* 
-set every entry to its minus value (do it on site)
-keep the result in the input tensor a and return nothing
-*/
-void _NegateMe(XTensor * a);
-
-/* 
-set every entry to its minus value (return an XTensor structure)
-make a new tensor to keep the result and return it
-*/
-XTensor Negate(const XTensor & a);
-
-/* set every entry to its minus value */
-void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __NEGATE_H__
--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#include "../../XTensor.h"
-#include "../../XName.h"
-#include "Sign.h"
-#include "Sign.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-set every entry to its sign value
->> a - input tensor we are processing
->> b - output tensor we are processing
-*/
-void _Sign(const XTensor * a, XTensor * b)
-{
-#ifdef USE_CUDA
-    /* run it on GPUs */
-    if (a->devID >= 0) {
-        _CudaSign(a, b);
-    return;
-}
-#endif
-
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-    DTYPE * d = (DTYPE*)a->data;
-    DTYPE * db = (DTYPE*)b->data;
-    for (int i = 0; i < a->unitNum; i++) {
-        if (d[i] > 0)
-            db[i] = 1.0F;
-        else if (d[i] == 0)
-            db[i] = 0.0F;
-        else
-            db[i] = -1.0F;
-    }
-}
-
-/*
-set every entry to its sign value (do it on site)
-keep the result in the input tensor a and return nothing
->> a - the tensor we are processing
-*/
-void _SignMe(XTensor * a)
-{
-    _Sign(a, a);
-}
-
-/*
-set every entry to its sign value (return an XTensor structure)
-make a new tensor to keep the result and return it
->> a - input tensor we are processing
-<< return - the sign value of the input tensor
-*/
-XTensor Sign(const XTensor & a)
-{
-    XTensor b(&a);
-    b.SetTMPFlag();
-
-    /* call _Sign function */
-    _Sign(&a, &b);
-
-    /* tensor connections */
-    XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
-
-    return b;
-}
-
-/*
-set every entry to its sign value
->> a - input tensor we are processing
->> b - output tensor we are processing
->> requireLink - if add operation to network
-*/
-void Sign(const XTensor & a, XTensor & b, bool requireLink)
-{
-    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
-        InitTensor(&b, &a);
-    }
-
-    /* call _Sign function */
-    _Sign(&a, &b);
-
-    if (requireLink) {
-        /* tensor connections */
-        XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
-    }
-}
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.cuh
+++ b/source/tensor/core/arithmetic/Sign.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#ifndef __SIGN_CUH__
-#define __SIGN_CUH__
-
-#include "Sign.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* set each entry to its sign value (CUDA Kernel) */
-__global__
-void KernelSign(DTYPE * a, DTYPE * b, int size);
-
-/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
-__global__
-void KernelSign(__half * a, __half * b, int size);
-
-/* set each entry to its sign value */
-void _CudaSign(const XTensor * a, XTensor * b);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SIGN_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#ifndef __SIGN_H__
-#define __SIGN_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* set every entry to its sign value */
-void _Sign(const XTensor * a, XTensor * b);
-
-/* 
-set every entry to its sign value (do it on site)
-keep the result in the input tensor a and return nothing
-*/
-void _SignMe(XTensor * a);
-
-/* 
-set every entry to its sign value  (return an XTensor structure)
-make a new tensor to keep the result and return it
-*/
-XTensor Sign(const XTensor & a);
-
-/* set every entry to its sign value */
-void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SIGN_H__
--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
@@ -73,7 +73,8 @@ void _funcName(const XTensor * a, XTensor * b, int num)                     \
    }                                                                       \
    CheckNTErrors((XTensor::IsSameShaped(a, b)),                            \
                  "Input tensors should have the same data type!");         \
-    CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!");   \
+    CheckNTErrors(a->dataType == X_INT && b->dataType == X_INT,             \
+                 "TODO!");                                                  \
    int * d = (int*)a->data;                                                \
    int * db = (int*)b->data;                                               \
    for (int i = 0; i < a->unitNum; i++)                                    \
@@ -90,30 +91,37 @@ void _funcName(const XTensor * a, XTensor * b, float num)                   \
    }                                                                       \
    CheckNTErrors((XTensor::IsSameShaped(a, b)),                            \
                  "Input tensors should have the same data type!");         \
-    CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
+    CheckNTErrors(a->dataType == X_FLOAT && b->dataType == X_FLOAT,         \
+                 "TODO!");                                                  \
    float * d = (float*)a->data;                                            \
    float * db = (float*)b->data;                                           \
    for (int i = 0; i < a->unitNum; i++)                                    \
        db[i] = (float)origFunc(d[i], num);                                 \
 }

-#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName)                  \
-void funcName(XTensor &a, int num)                                          \
+#define _SIMPLE_BINARY_FUNCTION_ME_INT(_funcNameMe, _funcName)              \
+void _funcNameMe(XTensor * a, int num)                                      \
 {                                                                           \
-    _funcName(&a, &a, num);                                                 \
-}                                                                           \
+    _funcName(a, a, num);                                                   \
+}      

-#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName)                      \
-void funcName(XTensor &a, float num)                                        \
+#define _SIMPLE_BINARY_FUNCTION_ME(_funcNameMe, _funcName)                  \
+void _funcNameMe(XTensor * a, float num)                                    \
+{                                                                           \
+    _funcName(a, a, num);                                                   \
+}                                                                          
+    
+#define SIMPLE_BINARY_FUNCTION_ME_INT(funcNameMe, _funcName)                \
+void funcNameMe(XTensor &a, int num)                                        \
 {                                                                           \
    _funcName(&a, &a, num);                                                 \
 }                                                                                                                                                \

-#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName)                     \
-void funcName(const XTensor &a, XTensor &b, int num)                        \
+#define SIMPLE_BINARY_FUNCTION_ME(funcNameMe, _funcName)                    \
+void funcNameMe(XTensor &a, float num)                                      \
 {                                                                           \
-    _funcName(&a, &b, num);                                                 \
-}                                                                           \
+    _funcName(&a, &a, num);                                                 \
+}                                                                           

 #define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId)            \
 XTensor funcName(const XTensor &a, float num)                               \
@@ -123,50 +131,83 @@ XTensor funcName(const XTensor &a, float num)                               \
    _funcName(&a, &b, num);                                                 \
    XLink::MakeLink(&a, NULL, &b, operationId);                             \
    return b;                                                               \
-}                                                                           \
+}                                                                           
+
+#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName, operationId)        \
+XTensor funcName(const XTensor &a, int num)                                 \
+{                                                                           \
+    XTensor b(&a);                                                          \
+    b.SetTMPFlag();                                                         \
+    _funcName(&a, &b, num);                                                 \
+    XLink::MakeLink(&a, NULL, &b, operationId);                             \
+    return b;                                                               \
+}                                                                           

 #define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId)       \
-void funcName(const XTensor &a, XTensor &b, float num, bool requireLink)    \
+void funcName(const XTensor &a, XTensor &b, float num)                      \
 {                                                                           \
    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {                      \
        InitTensor(&b, &a);                                                 \
    }                                                                       \
    _funcName(&a, &b, num);                                                 \
-    if (requireLink) {                                                      \
+    if (b.enableGrad) {                                                     \
        XLink::MakeLink(&a, NULL, &b, operationId);                         \
    }                                                                       \
-}                                                                           \
+}                                                                           
+
+#define SIMPLE_BINARY_FUNCTION_INT_VOID(funcName, _funcName, operationId)   \
+void funcName(const XTensor &a, XTensor &b, int num)                        \
+{                                                                           \
+    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {                      \
+        InitTensor(&b, &a);                                                 \
+    }                                                                       \
+    _funcName(&a, &b, num);                                                 \
+    if (b.enableGrad) {                                                     \
+        XLink::MakeLink(&a, NULL, &b, operationId);                         \
+    }                                                                       \
+}                                                                           

 _SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
-SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
+SIMPLE_BINARY_FUNCTION_ME_INT(ScaleMe, _Scale)
+SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale, MATH_SCALE)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Scale, _Scale, MATH_SCALE)

 _SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
-SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
+_SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
+SIMPLE_BINARY_FUNCTION_ME(ScaleMe, _Scale)
 SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
 SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)

 _SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
-SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
-SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
+SIMPLE_BINARY_FUNCTION_ME_INT(DescaleMe, _Descale)
+SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale, MATH_DESCALE)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Descale, _Descale, MATH_DESCALE)

 _SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
-SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
+_SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
+SIMPLE_BINARY_FUNCTION_ME(DescaleMe, _Descale)
 SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE)
 SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE)

 _SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
-SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
+SIMPLE_BINARY_FUNCTION_ME_INT(ShiftMe, _Shift)
+SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift, MATH_SHIFT)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Shift, _Shift, MATH_SHIFT)

 _SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
-SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
+_SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
+SIMPLE_BINARY_FUNCTION_ME(ShiftMe, _Shift)
 SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
 SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)

 _SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
-SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
+SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
+SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod, MATH_MOD)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Mod, _Mod, MATH_MOD)

 #else
 /* define three marco separately, specify the respective function names (CPU mode) */

--- a/source/tensor/core/math/Binary.h
+++ b/source/tensor/core/math/Binary.h
@@ -16,8 +16,8 @@
 */

 /*
-* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-04-05
-*/
+ * $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-04-05
+ */

 #ifndef __BINARY_H__
 #define __BINARY_H__
@@ -26,105 +26,84 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/*
-scale up tensor entires
-b = a * scale
-*/
+/* scale up tensor entries
+b = a * scale */
 void _Scale(const XTensor * a, XTensor * b, int scale);
 void _Scale(const XTensor * a, XTensor * b, float scale);
-
-/*
-scale up tensor entires (on site)
-b = a * scale
-*/
-void _ScaleMe(XTensor & a, int scale);
-void _ScaleMe(XTensor & a, float scale);
-    
-/*
-scale up tensor entires
-b = a * scale
-*/
-void Scale(const XTensor & a, XTensor &b, int scale);
-void Scale(const XTensor & a, XTensor &b, float scale, bool requireLink = false);
-
-/*
-scale up tensor entires (return an XTensor structure)
-b = a * scale
-*/
+/* scale up tensor entries (on site)
+a = a * scale */
+void _ScaleMe(XTensor * a, int scale);
+void _ScaleMe(XTensor * a, float scale);
+/* scale up tensor entries (on site)
+a = a * scale */
+void ScaleMe(XTensor & a, int scale);
+void ScaleMe(XTensor & a, float scale);
+/* scale up tensor entries
+b = a * scale */
+void Scale(const XTensor & a, XTensor & b, int scale);
+void Scale(const XTensor & a, XTensor & b, float scale);
+/* scale up tensor entries (return an XTensor structure)
+b = a * scale */
+XTensor Scale(const XTensor & a, int scale);
 XTensor Scale(const XTensor & a, float scale);

-/*
-descale tensor entires
-b = a / scale
-*/
+/* descale tensor entries
+b = a / scale */
 void _Descale(const XTensor * a, XTensor * b, int scale);
 void _Descale(const XTensor * a, XTensor * b, float scale);
-
-/*
-descale tensor entires (on site)
-b = a / scale
-*/
-void _DescaleMe(XTensor & a, int scale);
-void _DescaleMe(XTensor & a, float scale);
-    
-/*
-descale tensor entires
-b = a / scale
-*/
+/* descale tensor entries (on site)
+a = a / scale */
+void _DescaleMe(XTensor * a, int scale);
+void _DescaleMe(XTensor * a, float scale);
+/* descale tensor entries (on site)
+a = a / scale */
+void DescaleMe(XTensor & a, int scale);
+void DescaleMe(XTensor & a, float scale); 
+/* descale tensor entries
+b = a / scale */
 void Descale(const XTensor & a, XTensor & b, int scale);
-void Descale(const XTensor & a, XTensor & b, float scale, bool requireLink = false);
-
-/*
-descale tensor entires (return an XTensor structure)
-b = a / scale
-*/
+void Descale(const XTensor & a, XTensor & b, float scale);
+/* descale tensor entries (return an XTensor structure)
+b = a / scale */
+XTensor Descale(const XTensor & a, int scale);
 XTensor Descale(const XTensor & a, float scale);

-/*
-shift tensor entires
-b = a + shift
-*/
+/* shift tensor entries
+b = a + shift */
 void _Shift(const XTensor * a, XTensor * b, int shift);
 void _Shift(const XTensor * a, XTensor * b, float shift);
-
-/*
-shift tensor entires (on site)
-b = a + shift
-*/
-void _ShiftMe(XTensor & a, int shift);
-void _ShiftMe(XTensor & a, float shift);
-    
-/*
-shift tensor entires
-b = a + shift
-*/
+/* shift tensor entries (on site)
+a = a + shift */
+void _ShiftMe(XTensor * a, int shift);
+void _ShiftMe(XTensor * a, float shift);
+/* shift tensor entries (on site)
+a = a + shift */
+void ShiftMe(XTensor & a, int shift);
+void ShiftMe(XTensor & a, float shift); 
+/* shift tensor entries
+b = a + shift */
 void Shift(const XTensor & a, XTensor & b, int shift);
-void Shift(const XTensor & a, XTensor & b, float shift, bool requireLink = false);
-
-/*
-shift tensor entires (return an XTensor structure)
-b = a + shift
-*/
+void Shift(const XTensor & a, XTensor & b, float shift);
+/* shift tensor entries (return an XTensor structure)
+b = a + shift */
+XTensor Shift(const XTensor & a, int shift);
 XTensor Shift(const XTensor & a, float shift);

-
-/*
-mod tensor entires
-b = a % mod
-*/
+/* mod tensor entries
+b = a % base */
 void _Mod(const XTensor * a, XTensor * b, int base);
-
-/*
-mod tensor entires (on site)
-b = a % mod
-*/
-void _ModMe(XTensor & a, int base);
-    
-/*
-mod tensor entires
-b = a % mod
-*/
+/* mod tensor entries (on site)
+a = a % base */
+void _ModMe(XTensor * a, int base);
+/* mod tensor entries (on site)
+a = a % base */
+void ModMe(XTensor & a, int base);
+/* mod tensor entries
+b = a % base */
 void Mod(const XTensor & a, XTensor & b, int base);
+/* mod tensor entries (return an XTensor structure)
+b = a % base */
+XTensor Mod(const XTensor & a, int base);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
@@ -30,6 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA

 __device__
+DTYPE cudanegate(DTYPE x) /* device helper: returns the arithmetic negation of x */
+{
+    return -x;
+}
+
+__device__
 DTYPE cudasquare(DTYPE x)
 {
    return x * x;
@@ -42,6 +48,17 @@ DTYPE cudaround(DTYPE r)
 }

 __device__
+DTYPE cudasign(DTYPE r) /* device helper: maps r to its sign, i.e., 1, 0 or -1 */
+{
+    if (r > 0)
+       return 1.0F;
+    else if (r == 0)
+       return 0.0F;
+    else /* both tests above failed: r < 0 (NOTE(review): presumably NaN also lands here if DTYPE is floating point - confirm) */
+       return -1.0F;
+}
+
+__device__
 DTYPE cudaisnonzero(DTYPE r)
 {
    return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
@@ -72,7 +89,7 @@ void _Cuda##funcName(const XTensor * a, XTensor * b)                        \
 {                                                                           \
    CheckNTErrors((XTensor::IsSameShaped(a, b)),                            \
                  "Input tensors should have the same type!");              \
-    CheckNTErrors((a->isSparse == false), "TODO!");                         \
+    CheckNTErrors(a->isSparse == false, "TODO!");                           \
                                                                            \
    int gridSize[3];                                                        \
    int blockSize[3];                                                       \
@@ -107,7 +124,9 @@ SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
 SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
 SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
 SIMPLE_UNARY_FUNCTION_GPU(Log, log)
+SIMPLE_UNARY_FUNCTION_GPU(Negate, cudanegate)
 SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
+SIMPLE_UNARY_FUNCTION_GPU(Sign, cudasign)
 SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
 SIMPLE_UNARY_FUNCTION_GPU(Square, cudasquare)


--- a/source/tensor/core/math/Unary.cuh
+++ b/source/tensor/core/math/Unary.cuh
@@ -92,6 +92,15 @@ void KernelLog(__half * a, __half * b, int size);
 /* set each entry to its logarithm value */
 void _CudaLog(const XTensor * a, XTensor * b);

+/* set each entry to its negative value (CUDA Kernel) */
+__global__
+void KernelNegate(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its negative value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelNegate(__half * a, __half * b, int size);
+/* set each entry to its negative value */
+void _CudaNegate(const XTensor * a, XTensor * b);
+
 /* set each entry to its round value (CUDA Kernel) */
 __global__
 void KernelRound(DTYPE * a, DTYPE * b, int size);
@@ -101,6 +110,15 @@ void KernelRound(__half * a, __half * b, int size);
 /* set each entry to its round value */
 void _CudaRound(const XTensor * a, XTensor * b);

+/* set each entry to its sign value (CUDA Kernel) */
+__global__
+void KernelSign(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelSign(__half * a, __half * b, int size);
+/* set each entry to its sign value */
+void _CudaSign(const XTensor * a, XTensor * b);
+
 /* set each entry to its sqrt value (CUDA Kernel) */
 __global__
 void KernelSqrt(DTYPE * a, DTYPE * b, int size);

--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
--- a/source/tensor/function/CrossEntropy.cpp
+++ b/source/tensor/function/CrossEntropy.cpp
--- a/source/tensor/function/CrossEntropy.cu
+++ b/source/tensor/function/CrossEntropy.cu
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
- */
-
-#ifndef __CROSSENTROPY_CUH__
-#define __CROSSENTROPY_CUH__
-
-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "CrossEntropy.cuh"
-#include "CrossEntropy.h"
-#include "../core/arithmetic/Div.h"
-#include "../core/arithmetic/Multiply.h"
-#include "../core/arithmetic/MultiplyDim.h"
-#include "../core/arithmetic/Negate.h"
-#include "../core/math/Unary.h"
-#include "../core/math/ScaleAndShift.h"
-#include "../core/reduce/ReduceSum.h"
-#include "../core/reduce/ReduceSumAll.h"
-#include "../core/shape/Transpose.h"
-#include "../core/shape/Unsqueeze.h"
-
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-
-/*
-compute the cross entropy loss (cuda version) 
-loss = sum_{i} (-gold_i * log(output_i))
-where gold and output are distributions 
-        
->> output - model prediction
->> gold - gold standard
->> loss - compute loss
->> weight - a rescaling weight given to each class
->> padding - specify a target value that is ignored and does not contribute to the loss computation
->> leadingDim - the leading dimension for the output
-*/
-void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
-                           XTensor * loss, const XTensor * weight, 
-                           const XTensor * padding, int leadingDim)
-{
-    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    
-    XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
-    XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
-    
-    _Log(output, interBuf1);
-    _Multiply(gold, interBuf1, interBuf2);
-
-    if(weight != NULL)
-        _MultiplyDimMe(interBuf2, weight, n);
-    _NegateMe(interBuf2);
-    _ReduceSum(interBuf2, loss, n);
-    
-    if(padding != NULL)
-        _MultiplyMe(loss, padding);
-
-    DelTensorBuf(interBuf2);
-    DelTensorBuf(interBuf1);
-}
-
-/*
-compute the cross entropy loss (scalar version) 
-
-loss = sum_{i} (-gold_i * log(output_i))
-where gold and output are distributions 
-        
->> output - model prediction
->> gold - gold standard
->> reduceWay - loss compute way, sum or mean
->> weight - a rescaling weight given to each class
->> padding - specify a target value that is ignored and does not contribute to the loss computation
->> leadingDim - the leading dimension for the output
-<< return - the cross entropy loss that is a scalar
-*/
-DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
-                            LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
-                            const XTensor * padding, int leadingDim)
-{
-    DTYPE loss = 0;
-
-    int order = output->order;
-    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    int leadingDimSize = output->GetDim(n);
-
-    CheckNTErrors(n >= 0 && n < output->order, 
-                 "Wrong leadingDim!");
-    CheckNTErrors(XTensor::IsSameShaped(output, gold), 
-                 "The output tensor and gold tensor must be of the same size!");
-    CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize, 
-                 "Wrong weight tensor!");
-    CheckNTErrors(padding == NULL || padding->order == output->order - 1, 
-                 "Wrong padding tensor!");
-    CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, 
-                 "TODO!");
-    
-    int * dimSize = new int[output->order - 1];
-    for (int i = 0; i < order; i++) {
-        if(i < n)
-            dimSize[i] = output->dimSize[i];
-        else if(i > n)
-            dimSize[i - 1] = output->dimSize[i];
-    }
-
-    XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio, 
-                                     output->devID, output->mem);
-
-    _CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
-
-    loss = _ReduceSumAll(lossBuf);
-
-    if(reduceWay == REDUCE_MEAN) {
-        int nonZeroNum;
-        if(padding == NULL) {
-            nonZeroNum = lossBuf->unitNum;
-        }
-        else {
-            XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
-            _IsNonZero(padding, tmp);
-            nonZeroNum = (int)_ReduceSumAll(tmp);
-            DelTensorBuf(tmp);
-        }
-
-        loss = loss / (DTYPE)nonZeroNum;
-    }
-    else if(reduceWay == REDUCE_SUM) {
-        /* don't need to do anything */
-    }
-    else {
-        ShowNTErrors("TODO");
-    }
-
-    delete[] dimSize;
-    DelTensorBuf(lossBuf);
-
-    return loss;
-}
-
-/* 
-backward computation of cross entropy function 
-
-loss = sum_{i} (-t_i * log(y_i))
-dE/dy_i = -t_i / y_i
-where E is the error(loss) function that measure the errors in y
-with respect to gold standard, and y this the model output
-
->> dedy - dE/dy (for return)
->> output - model prediction
->> gold - gold standard
->> weight - a rescaling weight given to each class
->> padding - specify a target value that is ignored and does not contribute to the loss computation
->> leadingDim - the leading dimension for the output
-*/
-void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                               const XTensor * gold, const XTensor * weight,
-                               XTensor * padding, int leadingDim)
-{
-    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
-    
-    _Div(gold, output, dedy);
-    _NegateMe(dedy);
-    if(weight != NULL)
-        _MultiplyDimMe(dedy, weight, n);
-    if(padding != NULL) {
-        int paddingOrder = padding->order;
-        int * paddingDims = new int[paddingOrder];
-        memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
-        padding->Reshape(padding->unitNum);
-
-        int order = dedy->order;
-        int * dims = new int[order];
-        memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
-        dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));
-        _MultiplyDimMe(dedy, padding, 0);
-
-        padding->Reshape(paddingOrder, paddingDims);
-        dedy->Reshape(order, dims);
-
-        delete[] paddingDims;
-        delete[] dims;
-    }
-
-    //if(padding != NULL) {
-    //    XTensor * tmp = NewTensor(padding);
-    //    _IsNonZero(padding, tmp);
-    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
-    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
-    //    delete tmp;
-    //}
-    //else {
-    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
-    //}
-
-}
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
--- a/source/tensor/function/CrossEntropy.cuh
+++ b/source/tensor/function/CrossEntropy.cuh
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
- */
-
-#ifndef __CROSSENTROPY_CUH__
-#define __CROSSENTROPY_CUH__
-
-#include "../XTensor.h"
-#include "CrossEntropy.h"
-
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-
-/* compute the cross entropy loss */
-void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
-                           XTensor * loss, const XTensor * weight = NULL, 
-                           const XTensor * padding = NULL, int leadingDim = -1);
-
-/* compute the cross entropy loss */
-DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
-                            LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL, 
-                            const XTensor * padding = NULL, int leadingDim = -1);
-
-/* backward computation of cross entropy function */
-void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                               const XTensor * gold, const XTensor * weight = NULL, 
-                               XTensor * padding = NULL, int leadingDim = -1);
-
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
--- a/source/tensor/function/CrossEntropy.h
+++ b/source/tensor/function/CrossEntropy.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
- */
-
-#ifndef __CROSSENTROPY_H__
-#define __CROSSENTROPY_H__
-
-#include "../XTensor.h"
-
-namespace nts{ // namespace nts(NiuTrans.Tensor)
-
-enum LOSS_COMPUTE_WAY{
-REDUCE_SUM,
-REDUCE_MEAN
-};
-
-/* compute the cross entropy loss */
-void _CrossEntropy(const XTensor * output, const XTensor * gold, 
-                   XTensor * loss, const XTensor * weight = NULL, 
-                   const XTensor * padding = NULL, int leadingDim = -1);
-
-/* compute the cross entropy loss */
-void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
-                         XTensor * loss, const XTensor * weight = NULL, 
-                         const XTensor * padding = NULL, int leadingDim = -1);
-
-/* compute the cross entropy loss (return the loss) */
-DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
-                    LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL, 
-                    const XTensor * padding = NULL, int leadingDim = -1);
-
-/* compute the cross entropy loss (return the loss) */
-DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
-                        LOSS_COMPUTE_WAY reduceWay = REDUCE_MEAN, const XTensor * weight = NULL,
-                        const XTensor * padding = NULL, int leadingDim = -1);
-
-/* backward computation of cross entropy function */
-void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, 
-                           const XTensor * gold, const XTensor * weight = NULL, 
-                           XTensor * padding = NULL, int leadingDim = -1);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __CROSSENTROPY_H__
\ No newline at end of file
--- a/source/tensor/function/Loss.cu
+++ b/source/tensor/function/Loss.cu
@@ -25,7 +25,6 @@
 #include "../core/math/Power.h"
 #include "../core/math/ScaleAndShift.h"
 #include "../core/math/Unary.h"
-#include "../core/arithmetic/Negate.h"
 #include "../core/arithmetic/Sum.h"
 #include "../core/arithmetic/Multiply.h"
 #include "../core/reduce/ReduceSum.h"

--- a/source/tensor/loss/CrossEntropy.cpp
+++ b/source/tensor/loss/CrossEntropy.cpp
@@ -28,7 +28,6 @@
 #include "../core/arithmetic/Multiply.h"
 #include "../core/math/Unary.h"
 #include "../core/math/ScaleAndShift.h"
-#include "../core/arithmetic/Negate.h"
 #include "../core/reduce/ReduceSum.h"
 #include "../core/reduce/ReduceSumAll.h"


--- a/source/tensor/loss/CrossEntropy.cu
+++ b/source/tensor/loss/CrossEntropy.cu
@@ -29,7 +29,6 @@
 #include "../core/arithmetic/Div.h"
 #include "../core/arithmetic/Multiply.h"
 #include "../core/arithmetic/MultiplyDim.h"
-#include "../core/arithmetic/Negate.h"
 #include "../core/math/Unary.h"
 #include "../core/math/ScaleAndShift.h"
 #include "../core/reduce/ReduceSum.h"

--- a/source/tensor/test/TNegate.h
+++ b/source/tensor/test/TNegate.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_NEGATE_H__
 #define __TEST_NEGATE_H__

-#include "../core/arithmetic/Negate.h"
+#include "../core/math/Unary.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/test/TSign.h
+++ b/source/tensor/test/TSign.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_SIGN_H__
 #define __TEST_SIGN_H__

-#include "../core/arithmetic/Sign.h"
+#include "../core/math/Unary.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)