Commit 2cba1bdd by xuchen

implement the negate and sign operations via macros (unary and binary)

parent 0d96c2a0
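The net effect of the commit: Negate and Sign stop being hand-written operator files and are instead generated by the shared unary-function macros shown later in this commit, while the requireLink arguments of the void-style operators give way to a per-tensor enableGrad flag. A minimal usage sketch of the resulting API (a hedged illustration, not part of the commit; values are made up, a CPU build and the usual InitTensor1D/Set1D helpers are assumed, and include paths depend on the build layout):

#include "tensor/XTensor.h"     /* path assumed */
#include "tensor/core/CHeader.h"
using namespace nts;

void NegateSignDemo()
{
    XTensor a;
    InitTensor1D(&a, 3, X_FLOAT);
    a.Set1D(-2.0F, 0);
    a.Set1D( 0.0F, 1);
    a.Set1D( 5.0F, 2);

    XTensor n = Negate(a);   /* n = { 2, 0, -5} */
    XTensor s = Sign(a);     /* s = {-1, 0,  1} */

    NegateMe(a);             /* in-place variant generated by SIMPLE_UNARY_FUNCTION_ME */
}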
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "XNoder.h"
#include "XBackwardData.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XDataGrad::MakeGrad(XTensor * node, bool isEfficient)
{
    CheckNTErrors(node->grad != NULL, "No gradient found!");

    XLink &income = node->income;
    int operID = income.typeID;

    if(operID == GETANDSET_CONVERTDATATYPE)
        GradConvertDataType(node, isEfficient);
    else if(operID == GETANDSET_INDEXTOONEHOT)
        GradIndexToOnehot(node, isEfficient);
    else if(operID == GETANDSET_ONEHOTTOINDEX)
        GradOnehotToIndex(node, isEfficient);
    else{
        ShowNTErrors("TODO!");
    }
}
/* indicates whether the node is for a data operation */
bool XDataGrad::IsDataOP(XTensor * node)
{
    XLink &income = node->income;
    return (income.typeID & DATA_BASE) != 0;
}
/*
gradient computation for ConvertDataType
for
b = convertdatatype(a)
we have
dE/da = convertdatatype(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");

    XTensor * input = income.tails[0];
    XNoder::MakeGrad(input);

    _ConvertDataType(node->grad, input->grad);
}
/*
gradient computation for OnehotToIndex
for
b = OnehotToIndex(a)
we have
dE/da = IndexToOnehot(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");

    XTensor * input = income.tails[0];
    XNoder::MakeGrad(input);
}
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = OnehotToIndex(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");

    XTensor * input = income.tails[0];
    XNoder::MakeGrad(input);
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDDATA_H__
#define __XBACKWARDDATA_H__
namespace nts{
/* this class computes the gradient for tensor data operation given a node */
class XDataGrad
{
public:
    /* compute dE/dx of a node */
    static
    void MakeGrad(XTensor * node, bool isEfficient);

    /* indicates whether the node is for a data operation */
    static
    bool IsDataOP(XTensor * node);

private:
    /* gradient computation for ConvertDataType: b = convertdatatype(a, datatype) */
    static
    void GradConvertDataType(XTensor * node, bool isEfficient);

    /* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
    static
    void GradIndexToOnehot(XTensor * node, bool isEfficient);

    /* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
    static
    void GradOnehotToIndex(XTensor * node, bool isEfficient);
};
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
...@@ -302,12 +302,12 @@ void T2TSearch::Generate(T2TStateBundle * beam)
       row means a previous state. The column number is size-of-beam \times vocab-size. We,
       therefore, divide entries of the top-k index by vocab-size to compute the id of the
       previous state for each hypothesis in the top-k list. */
-    _DescaleMe(preID, sizeVocab);
+    DescaleMe(preID, sizeVocab);

    /* Then, we do something similar to "preID". For the top-k predictions, we need
       to know their indices in the vocabulary. We compute the offset of each prediction
       in the vocabulary by dividing it with vocab-size and computing the remainder. */
-    _ModMe(index, sizeVocab);
+    ModMe(index, sizeVocab);

    score.Reshape(order, dims);
......
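To make the index arithmetic above concrete (a standalone sketch; the vocabulary size and the flattened indices are made up for the example):

#include <cstdio>

int main()
{
    const int sizeVocab = 8;            /* assumed vocabulary size */
    int topk[3] = { 19, 5, 23 };        /* assumed flattened top-k indices over a (beam x vocab) matrix */

    for (int i = 0; i < 3; i++) {
        int preID = topk[i] / sizeVocab;   /* what DescaleMe(preID, sizeVocab) computes */
        int word  = topk[i] % sizeVocab;   /* what ModMe(index, sizeVocab) computes */
        printf("index %d -> previous state %d, word id %d\n", topk[i], preID, word);
    }
    return 0;
}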
...@@ -280,6 +280,7 @@ void XTensor::Init()
    isTmp = false;
    isGrad = false;
    isVar = false;
+   enableGrad = false;
    visitMark = 0;
    grad = NULL;
}
...@@ -310,6 +311,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
{
    strcpy(name, tensor.name);
    order = tensor.order;
+   enableGrad = tensor.enableGrad;
    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
    dataType = tensor.dataType;
...@@ -2447,6 +2449,7 @@ void InitTensor(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+   tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize,
               reference->dataType, reference->denseRatio,
               reference->devID, reference->mem);
...@@ -2462,6 +2465,7 @@ void InitTensorV2(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+   tensor->enableGrad = reference->enableGrad;
    InitTensorV2(tensor, reference->order, reference->dimSize,
                 reference->dataType, reference->devID);
}
...@@ -2476,6 +2480,7 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+   tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize,
               reference->dataType, reference->denseRatio,
               -1);
......
...@@ -151,6 +151,9 @@ public:
    /* indicates whether the tensor keeps the gradient when used as model parameters */
    bool isGrad;

+   /* indicates whether the gradient of the tensor should be computed */
+   bool enableGrad;
+
    /* indicates whether the tensor is used as parameters (or variables) */
    bool isVar;
......
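Because the void-style operators now consult b.enableGrad instead of taking a requireLink argument (see the macro changes further down), recording an operation in the network is controlled per tensor, and the flag propagates through the InitTensor(...) reference overloads patched above. A hedged sketch of the new behaviour (values and helper names as in the library's usual API):

void EnableGradDemo()
{
    XTensor a, b;
    InitTensor1D(&a, 4, X_FLOAT);
    a.enableGrad = true;   /* b inherits the flag via InitTensor(&b, &a) inside the op */

    Negate(a, b);          /* XLink::MakeLink runs only because b.enableGrad is true */
}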
...@@ -36,8 +36,6 @@
 #include "arithmetic/MatrixMulBatched.h"
 #include "arithmetic/Multiply.h"
 #include "arithmetic/MultiplyDim.h"
-#include "arithmetic/Negate.h"
-#include "arithmetic/Sign.h"
 #include "arithmetic/Sub.h"
 #include "arithmetic/SubDim.h"
 #include "arithmetic/Sum.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its negative value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Negate(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
    /* run it on GPUs */
    if (a->devID >= 0) {
        _CudaNegate(a, b);
        return;
    }
#endif

    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

    DTYPE * d = (DTYPE*)a->data;
    DTYPE * db = (DTYPE*)b->data;
    for (int i = 0; i < a->unitNum; i++)
        db[i] = -d[i];
}

/*
set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
    _Negate(a, a);
}

/*
set every entry to its negative value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the negated input tensor
*/
XTensor Negate(const XTensor & a)
{
    XTensor b(&a);
    b.SetTMPFlag();

    /* call _Negate function */
    _Negate(&a, &b);

    /* tensor connections */
    XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);

    return b;
}

/*
set every entry to its negative value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - indicates whether we should add the operation to the network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
        InitTensor(&b, &a);
    }

    /* call _Negate function */
    _Negate(&a, &b);

    if (requireLink) {
        /* tensor connections */
        XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
    }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its negative value (CUDA Kernel)
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size)
        b[i] = -a[i];
}

/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * a, __half * b, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    if (i < size)
        b[i] = __hsub(__float2half(0), a[i]);
#else
    if (i < size)
        b[i] = __float2half(-__half2float(a[i]));
#endif
}

/*
set each entry to its negative value
>> a - input tensor
>> b - output tensor
*/
void _CudaNegate(const XTensor * a, XTensor * b)
{
    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
    CheckNTErrors((a->isSparse == false), "TODO!");

    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(a->devID, devIDBackup);

    if (a->dataType == DEFAULT_DTYPE) {
        KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
    }
    else if (a->dataType == X_FLOAT16) {
        KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
    }
    else {
        ShowNTErrors("TODO!");
    }

    BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its negative value (CUDA Kernel) */
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size);

/* set each entry to its negative value (CUDA Kernel) with float16 data type */
__global__
void KernelNegate(__half * a, __half * b, int size);

/* set each entry to its negative value */
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_H__
#define __NEGATE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its negative value */
void _Negate(const XTensor * a, XTensor * b);

/*
set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);

/*
set every entry to its negative value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);

/* set every entry to its negative value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__
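The header above exposes the library's three calling conventions for the same operation; a hedged sketch of how they differ (tensor setup abbreviated, helper names as in the library's usual API):

void NegateConventionsDemo()
{
    XTensor a, b;
    InitTensor1D(&a, 3, X_FLOAT);
    InitTensor1D(&b, 3, X_FLOAT);

    _Negate(&a, &b);         /* low-level: writes -a into the preallocated b, no link */
    _NegateMe(&a);           /* in-place: a = -a, returns nothing */
    XTensor c = Negate(a);   /* functional: new tensor, linked with MATH_NEGATE */
}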
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Sign(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
    /* run it on GPUs */
    if (a->devID >= 0) {
        _CudaSign(a, b);
        return;
    }
#endif

    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

    DTYPE * d = (DTYPE*)a->data;
    DTYPE * db = (DTYPE*)b->data;
    for (int i = 0; i < a->unitNum; i++) {
        if (d[i] > 0)
            db[i] = 1.0F;
        else if (d[i] == 0)
            db[i] = 0.0F;
        else
            db[i] = -1.0F;
    }
}

/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
    _Sign(a, a);
}

/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
    XTensor b(&a);
    b.SetTMPFlag();

    /* call _Sign function */
    _Sign(&a, &b);

    /* tensor connections */
    XLink::MakeLink(&a, NULL, &b, MATH_SIGN);

    return b;
}

/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - indicates whether we should add the operation to the network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
        InitTensor(&b, &a);
    }

    /* call _Sign function */
    _Sign(&a, &b);

    if (requireLink) {
        /* tensor connections */
        XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
    }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_CUH__
#define __SIGN_CUH__
#include "Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its sign value (CUDA Kernel) */
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type */
__global__
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_H__
#define __SIGN_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
void _Sign(const XTensor * a, XTensor * b);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SignMe(XTensor * a);
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__
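For reference, the sign convention implemented in these files maps negative entries to -1, zeros to 0 and positive entries to +1; a small hedged sketch (values made up, helper names as in the library's usual API):

void SignDemo()
{
    XTensor a;
    InitTensor1D(&a, 3, X_FLOAT);
    a.Set1D(-3.5F, 0);
    a.Set1D( 0.0F, 1);
    a.Set1D( 0.2F, 2);

    XTensor s = Sign(a);     /* s = {-1, 0, 1} */
}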
...@@ -73,7 +73,8 @@ void _funcName(const XTensor * a, XTensor * b, int num) \
 }                                                                  \
 CheckNTErrors((XTensor::IsSameShaped(a, b)),                       \
               "Input tensors should have the same data type!");    \
-CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
+CheckNTErrors(a->dataType == X_INT && b->dataType == X_INT,        \
+              "TODO!");                                            \
 int * d = (int*)a->data;                                           \
 int * db = (int*)b->data;                                          \
 for (int i = 0; i < a->unitNum; i++)                               \
...@@ -90,30 +91,37 @@ void _funcName(const XTensor * a, XTensor * b, float num) \
 }                                                                  \
 CheckNTErrors((XTensor::IsSameShaped(a, b)),                       \
               "Input tensors should have the same data type!");    \
-CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
+CheckNTErrors(a->dataType == X_FLOAT && b->dataType == X_FLOAT,    \
+              "TODO!");                                            \
 float * d = (float*)a->data;                                       \
 float * db = (float*)b->data;                                      \
 for (int i = 0; i < a->unitNum; i++)                               \
     db[i] = (float)origFunc(d[i], num);                            \
 }

-#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
-void funcName(XTensor &a, int num)                         \
-{                                                          \
-    _funcName(&a, &a, num);                                \
-}                                                          \
+#define _SIMPLE_BINARY_FUNCTION_ME_INT(_funcNameMe, _funcName) \
+void _funcNameMe(XTensor * a, int num)                         \
+{                                                              \
+    _funcName(a, a, num);                                      \
+}

-#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
-void funcName(XTensor &a, float num)                   \
-{                                                      \
-    _funcName(&a, &a, num);                            \
-}                                                      \
+#define _SIMPLE_BINARY_FUNCTION_ME(_funcNameMe, _funcName) \
+void _funcNameMe(XTensor * a, float num)                   \
+{                                                          \
+    _funcName(a, a, num);                                  \
+}
+
+#define SIMPLE_BINARY_FUNCTION_ME_INT(funcNameMe, _funcName) \
+void funcNameMe(XTensor &a, int num)                         \
+{                                                            \
+    _funcName(&a, &a, num);                                  \
+}

-#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
-void funcName(const XTensor &a, XTensor &b, int num)    \
-{                                                       \
-    _funcName(&a, &b, num);                             \
-}                                                       \
+#define SIMPLE_BINARY_FUNCTION_ME(funcNameMe, _funcName) \
+void funcNameMe(XTensor &a, float num)                   \
+{                                                        \
+    _funcName(&a, &a, num);                              \
+}

 #define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId) \
 XTensor funcName(const XTensor &a, float num)                    \
...@@ -123,50 +131,83 @@ XTensor funcName(const XTensor &a, float num) \
     _funcName(&a, &b, num);                                      \
     XLink::MakeLink(&a, NULL, &b, operationId);                  \
     return b;                                                    \
-}                                                                \
+}
+
+#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName, operationId) \
+XTensor funcName(const XTensor &a, int num)                          \
+{                                                                    \
+    XTensor b(&a);                                                   \
+    b.SetTMPFlag();                                                  \
+    _funcName(&a, &b, num);                                          \
+    XLink::MakeLink(&a, NULL, &b, operationId);                      \
+    return b;                                                        \
+}

 #define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
-void funcName(const XTensor &a, XTensor &b, float num, bool requireLink) \
+void funcName(const XTensor &a, XTensor &b, float num)                \
 {                                                                     \
     if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {                \
         InitTensor(&b, &a);                                           \
     }                                                                 \
     _funcName(&a, &b, num);                                           \
-    if (requireLink) {                                                \
+    if (b.enableGrad) {                                               \
         XLink::MakeLink(&a, NULL, &b, operationId);                   \
     }                                                                 \
-}                                                                     \
+}
+
+#define SIMPLE_BINARY_FUNCTION_INT_VOID(funcName, _funcName, operationId) \
+void funcName(const XTensor &a, XTensor &b, int num)                      \
+{                                                                         \
+    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {                    \
+        InitTensor(&b, &a);                                               \
+    }                                                                     \
+    _funcName(&a, &b, num);                                               \
+    if (b.enableGrad) {                                                   \
+        XLink::MakeLink(&a, NULL, &b, operationId);                       \
+    }                                                                     \
+}

 _SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
-SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
+SIMPLE_BINARY_FUNCTION_ME_INT(ScaleMe, _Scale)
+SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale, MATH_SCALE)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Scale, _Scale, MATH_SCALE)

 _SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
-SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
+_SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
+SIMPLE_BINARY_FUNCTION_ME(ScaleMe, _Scale)
 SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
 SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)

 _SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
-SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
-SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
+SIMPLE_BINARY_FUNCTION_ME_INT(DescaleMe, _Descale)
+SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale, MATH_DESCALE)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Descale, _Descale, MATH_DESCALE)

 _SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
-SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
+_SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
+SIMPLE_BINARY_FUNCTION_ME(DescaleMe, _Descale)
 SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE)
 SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE)

 _SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
-SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
+SIMPLE_BINARY_FUNCTION_ME_INT(ShiftMe, _Shift)
+SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift, MATH_SHIFT)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Shift, _Shift, MATH_SHIFT)

 _SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
-SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
+_SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
+SIMPLE_BINARY_FUNCTION_ME(ShiftMe, _Shift)
 SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
 SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)

 _SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
-SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
-SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
+_SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
+SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
+SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod, MATH_MOD)
+SIMPLE_BINARY_FUNCTION_INT_VOID(Mod, _Mod, MATH_MOD)

 #else
 /* define the macros separately, with the respective function names (CPU mode) */
......
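Each instantiation line above stamps out one concrete function. As one example, SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale, MATH_SCALE) expands to roughly the following (a hand-formatted expansion of the macro, shown for illustration):

XTensor Scale(const XTensor &a, int num)
{
    XTensor b(&a);
    b.SetTMPFlag();
    _Scale(&a, &b, num);
    XLink::MakeLink(&a, NULL, &b, MATH_SCALE);
    return b;
}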
...@@ -16,8 +16,8 @@
 */

 /*
 * $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-04-05
 */

 #ifndef __BINARY_H__
 #define __BINARY_H__
...@@ -26,105 +26,84 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

-/*
-scale up tensor entries
-b = a * scale
-*/
+/* scale up tensor entries
+   b = a * scale */
 void _Scale(const XTensor * a, XTensor * b, int scale);
 void _Scale(const XTensor * a, XTensor * b, float scale);

-/*
-scale up tensor entries (on site)
-b = a * scale
-*/
-void _ScaleMe(XTensor & a, int scale);
-void _ScaleMe(XTensor & a, float scale);
+/* scale up tensor entries (on site)
+   b = a * scale */
+void _ScaleMe(XTensor * a, int scale);
+void _ScaleMe(XTensor * a, float scale);
+
+/* scale up tensor entries (on site)
+   b = a * scale */
+void ScaleMe(XTensor & a, int scale);
+void ScaleMe(XTensor & a, float scale);

-/*
-scale up tensor entries
-b = a * scale
-*/
-void Scale(const XTensor & a, XTensor &b, int scale);
-void Scale(const XTensor & a, XTensor &b, float scale, bool requireLink = false);
+/* scale up tensor entries
+   b = a * scale */
+void Scale(const XTensor & a, XTensor & b, int scale);
+void Scale(const XTensor & a, XTensor & b, float scale);

-/*
-scale up tensor entries (return an XTensor structure)
-b = a * scale
-*/
+/* scale up tensor entries (return an XTensor structure)
+   b = a * scale */
+XTensor Scale(const XTensor & a, int scale);
 XTensor Scale(const XTensor & a, float scale);

-/*
-descale tensor entries
-b = a / scale
-*/
+/* descale tensor entries
+   b = a / scale */
 void _Descale(const XTensor * a, XTensor * b, int scale);
 void _Descale(const XTensor * a, XTensor * b, float scale);

-/*
-descale tensor entries (on site)
-b = a / scale
-*/
-void _DescaleMe(XTensor & a, int scale);
-void _DescaleMe(XTensor & a, float scale);
+/* descale tensor entries (on site)
+   b = a / scale */
+void _DescaleMe(XTensor * a, int scale);
+void _DescaleMe(XTensor * a, float scale);
+
+/* descale tensor entries (on site)
+   b = a / scale */
+void DescaleMe(XTensor & a, int scale);
+void DescaleMe(XTensor & a, float scale);

-/*
-descale tensor entries
-b = a / scale
-*/
 void Descale(const XTensor & a, XTensor & b, int scale);
-void Descale(const XTensor & a, XTensor & b, float scale, bool requireLink = false);
+/* descale tensor entries
+   b = a / scale */
+void Descale(const XTensor & a, XTensor & b, float scale);

-/*
-descale tensor entries (return an XTensor structure)
-b = a / scale
-*/
+/* descale tensor entries (return an XTensor structure)
+   b = a / scale */
+XTensor Descale(const XTensor & a, int scale);
 XTensor Descale(const XTensor & a, float scale);

-/*
-shift tensor entries
-b = a + shift
-*/
+/* shift tensor entries
+   b = a + shift */
 void _Shift(const XTensor * a, XTensor * b, int shift);
 void _Shift(const XTensor * a, XTensor * b, float shift);

-/*
-shift tensor entries (on site)
-b = a + shift
-*/
-void _ShiftMe(XTensor & a, int shift);
-void _ShiftMe(XTensor & a, float shift);
+/* shift tensor entries (on site)
+   b = a + shift */
+void _ShiftMe(XTensor * a, int shift);
+void _ShiftMe(XTensor * a, float shift);
+
+/* shift tensor entries (on site)
+   b = a + shift */
+void ShiftMe(XTensor & a, int shift);
+void ShiftMe(XTensor & a, float shift);

-/*
-shift tensor entries
-b = a + shift
-*/
 void Shift(const XTensor & a, XTensor & b, int shift);
-void Shift(const XTensor & a, XTensor & b, float shift, bool requireLink = false);
+/* shift tensor entries
+   b = a + shift */
+void Shift(const XTensor & a, XTensor & b, float shift);

-/*
-shift tensor entries (return an XTensor structure)
-b = a + shift
-*/
+/* shift tensor entries (return an XTensor structure)
+   b = a + shift */
+XTensor Shift(const XTensor & a, int shift);
 XTensor Shift(const XTensor & a, float shift);

-/*
-mod tensor entries
-b = a % mod
-*/
+/* mod tensor entries
+   b = a % mod */
 void _Mod(const XTensor * a, XTensor * b, int base);

-/*
-mod tensor entries (on site)
-b = a % mod
-*/
-void _ModMe(XTensor & a, int base);
+/* mod tensor entries (on site)
+   b = a % mod */
+void _ModMe(XTensor * a, int base);
+
+/* mod tensor entries (on site)
+   b = a % mod */
+void ModMe(XTensor & a, int base);

-/*
-mod tensor entries
-b = a % mod
-*/
+/* mod tensor entries
+   b = a % mod */
 void Mod(const XTensor & a, XTensor & b, int base);

+/* mod tensor entries (return an XTensor structure)
+   b = a % base */
+XTensor Mod(const XTensor & a, int base);
+
 } // namespace nts(NiuTrans.Tensor)
......
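With the int overloads declared above, the binary operations can be used uniformly on integer tensors; a hedged usage sketch (values made up, helper names as in the library's usual API):

void BinaryIntDemo()
{
    XTensor a;
    InitTensor1D(&a, 4, X_INT);
    /* ... fill a with integer values ... */

    XTensor b = Scale(a, 3);   /* b = a * 3 */
    XTensor m = Mod(a, 2);     /* m = a % 2 */
    ShiftMe(a, 1);             /* in-place: a = a + 1 */
}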
...@@ -27,6 +27,10 @@
 namespace nts{

+DTYPE negate(DTYPE x) {
+    return -x;
+}
+
 DTYPE square(DTYPE x)
 {
     return x * x;
...@@ -37,6 +41,16 @@ DTYPE round(DTYPE r)
     return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
 }

+DTYPE sign(DTYPE r)
+{
+    if (r > 0)
+        return 1.0F;
+    else if (r == 0)
+        return 0.0F;
+    else
+        return -1.0F;
+}
+
 DTYPE isnonzero(DTYPE r)
 {
     return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
...@@ -65,6 +79,20 @@ void _funcName(const XTensor * a, XTensor * b) \
     for (int i = 0; i < a->unitNum; i++)                       \
         db[i] = (DTYPE)origFunc(d[i]);                         \
 }
+#else
+/* define the macro separately, with the respective function name (CPU mode) */
+#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc)            \
+void _funcName(const XTensor * a, XTensor * b)                 \
+{                                                              \
+    CheckNTErrors((XTensor::IsSameShaped(a, b)),               \
+                  "Input tensors should have the same type!"); \
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");    \
+    DTYPE * d = (DTYPE*)a->data;                               \
+    DTYPE * db = (DTYPE*)b->data;                              \
+    for (int i = 0; i < a->unitNum; i++)                       \
+        db[i] = (DTYPE)origFunc(d[i]);                         \
+}
+#endif

 #define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
 void _funcNameMe(XTensor * a)                             \
...@@ -72,8 +100,14 @@ void _funcNameMe(XTensor * a) \
     _funcName(a, a);                                      \
 }

+#define SIMPLE_UNARY_FUNCTION_ME(funcNameMe, _funcName) \
+void funcNameMe(XTensor & a)                            \
+{                                                       \
+    _funcName(&a, &a);                                  \
+}
+
 #define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
-XTensor funcName(const XTensor &a)                              \
+XTensor funcName(const XTensor & a)                             \
 {                                                               \
     XTensor b(&a);                                              \
     b.SetTMPFlag();                                             \
...@@ -83,191 +117,124 @@ XTensor funcName(const XTensor &a) \
 }

 #define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
-void funcName(const XTensor &a, XTensor &b, bool requireLink)        \
+void funcName(const XTensor & a, XTensor & b)                        \
 {                                                                    \
     if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {               \
         InitTensor(&b, &a);                                          \
     }                                                                \
     _funcName(&a, &b);                                               \
-    if (requireLink) {                                               \
+    if (b.enableGrad) {                                              \
         XLink::MakeLink(&a, NULL, &b, operationId);                  \
     }                                                                \
 }

+#ifdef USE_CUDA
 _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
-_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
-SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
-SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
 _SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
-_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
-SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
-SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
 _SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
-_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
-SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
-SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
 _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
-_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
-SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
-SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
 _SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
-_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
-SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
-SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
 _SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
-_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
-SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
-SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
 _SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
-_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
-SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
-SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
+_SIMPLE_UNARY_FUNCTION(_Negate, _CudaNegate, negate)
 _SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
-_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
-SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
-SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
+_SIMPLE_UNARY_FUNCTION(_Sign, _CudaSign, sign)
 _SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
-_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
-SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
-SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
 _SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
-_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
-SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
-SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
 _SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
-_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
-SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
-SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
 _SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
-_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
-SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
-SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
 _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
-_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
-SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
-SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)

 #else
-/* define the macros separately, with the respective function names (CPU mode) */
-#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc)            \
-void _funcName(const XTensor * a, XTensor * b)                 \
-{                                                              \
-    CheckNTErrors((XTensor::IsSameShaped(a, b)),               \
-                  "Input tensors should have the same type!"); \
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");    \
-    DTYPE * d = (DTYPE*)a->data;                               \
-    DTYPE * db = (DTYPE*)b->data;                              \
-    for (int i = 0; i < a->unitNum; i++)                       \
-        db[i] = (DTYPE)origFunc(d[i]);                         \
-}
-
-#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
-void _funcNameMe(XTensor * a)                             \
-{                                                         \
-    _funcName(a, a);                                      \
-}
-
-#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
-XTensor funcName(const XTensor &a)                              \
-{                                                               \
-    XTensor b(&a);                                              \
-    b.SetTMPFlag();                                             \
-    _funcName(&a, &b);                                          \
-    XLink::MakeLink(&a, NULL, &b, operationId);                 \
-    return b;                                                   \
-}
-
-#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
-void funcName(const XTensor &a, XTensor &b, bool requireLink)        \
-{                                                                    \
-    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {               \
-        InitTensor(&b, &a);                                          \
-    }                                                                \
-    _funcName(&a, &b);                                               \
-    if (requireLink) {                                               \
-        XLink::MakeLink(&a, NULL, &b, operationId);                  \
-    }                                                                \
-}
-
 _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
+_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
+_SIMPLE_UNARY_FUNCTION(_Exp, exp)
+_SIMPLE_UNARY_FUNCTION(_Floor, floor)
+_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
+_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
+_SIMPLE_UNARY_FUNCTION(_Log, log)
+_SIMPLE_UNARY_FUNCTION(_Negate, negate)
+_SIMPLE_UNARY_FUNCTION(_Round, round)
+_SIMPLE_UNARY_FUNCTION(_Sign, sign)
+_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
+_SIMPLE_UNARY_FUNCTION(_Square, square)
+_SIMPLE_UNARY_FUNCTION(_Sin, sin)
+_SIMPLE_UNARY_FUNCTION(_Cos, cos)
+_SIMPLE_UNARY_FUNCTION(_Tan, tan)
+#endif // USE_CUDA

 _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
+SIMPLE_UNARY_FUNCTION_ME(AbsoluteMe, _Absolute)
 SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
 SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)

-_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
 _SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
+SIMPLE_UNARY_FUNCTION_ME(CeilMe, _Ceil)
 SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
 SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)

-_SIMPLE_UNARY_FUNCTION(_Exp, exp)
 _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
+SIMPLE_UNARY_FUNCTION_ME(ExpMe, _Exp)
 SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
 SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)

-_SIMPLE_UNARY_FUNCTION(_Floor, floor)
 _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
+SIMPLE_UNARY_FUNCTION_ME(FloorMe, _Floor)
 SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
 SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)

-_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
 _SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
+SIMPLE_UNARY_FUNCTION_ME(IsNonZeroMe, _IsNonZero)
 SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
 SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)

-_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
 _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
+SIMPLE_UNARY_FUNCTION_ME(IsZeroMe, _IsZero)
 SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
 SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)

-_SIMPLE_UNARY_FUNCTION(_Log, log)
 _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
+SIMPLE_UNARY_FUNCTION_ME(LogMe, _Log)
 SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
 SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)

-_SIMPLE_UNARY_FUNCTION(_Round, round)
+_SIMPLE_UNARY_FUNCTION_ME(_NegateMe, _Negate)
+SIMPLE_UNARY_FUNCTION_ME(NegateMe, _Negate)
+SIMPLE_UNARY_FUNCTION(Negate, _Negate, MATH_NEGATE)
+SIMPLE_UNARY_FUNCTION_VOID(Negate, _Negate, MATH_NEGATE)
+
 _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
+SIMPLE_UNARY_FUNCTION_ME(RoundMe, _Round)
 SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
 SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)

-_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
+_SIMPLE_UNARY_FUNCTION_ME(_SignMe, _Sign)
+SIMPLE_UNARY_FUNCTION_ME(SignMe, _Sign)
+SIMPLE_UNARY_FUNCTION(Sign, _Sign, MATH_SIGN)
+SIMPLE_UNARY_FUNCTION_VOID(Sign, _Sign, MATH_SIGN)
+
 _SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
+SIMPLE_UNARY_FUNCTION_ME(SqrtMe, _Sqrt)
 SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
 SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)

-_SIMPLE_UNARY_FUNCTION(_Square, square)
 _SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
+SIMPLE_UNARY_FUNCTION_ME(SquareMe, _Square)
 SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
 SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)

-_SIMPLE_UNARY_FUNCTION(_Sin, sin)
 _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
+SIMPLE_UNARY_FUNCTION_ME(SinMe, _Sin)
 SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
 SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)

-_SIMPLE_UNARY_FUNCTION(_Cos, cos)
 _SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
+SIMPLE_UNARY_FUNCTION_ME(CosMe, _Cos)
 SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
 SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)

-_SIMPLE_UNARY_FUNCTION(_Tan, tan)
 _SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
+SIMPLE_UNARY_FUNCTION_ME(TanMe, _Tan)
 SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
 SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)

-#endif
+/*_SIMPLE_UNARY_FUNCTION(_Round, round)
+_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
+SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
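To make the macro chain concrete: SIMPLE_UNARY_FUNCTION_VOID(Negate, _Negate, MATH_NEGATE) expands to roughly the following (a hand-formatted expansion, shown for illustration), which is exactly where the b.enableGrad test replaces the old requireLink parameter:

void Negate(const XTensor & a, XTensor & b)
{
    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
        InitTensor(&b, &a);
    }
    _Negate(&a, &b);
    if (b.enableGrad) {
        XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
    }
}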
...@@ -30,6 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA

 __device__
+DTYPE cudanegate(DTYPE x)
+{
+    return -x;
+}
+
+__device__
 DTYPE cudasquare(DTYPE x)
 {
     return x * x;
...@@ -42,6 +48,17 @@ DTYPE cudaround(DTYPE r)
 }

 __device__
+DTYPE cudasign(DTYPE r)
+{
+    if (r > 0)
+        return 1.0F;
+    else if (r == 0)
+        return 0.0F;
+    else
+        return -1.0F;
+}
+
+__device__
 DTYPE cudaisnonzero(DTYPE r)
 {
     return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
...@@ -72,7 +89,7 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
 {                                                          \
     CheckNTErrors((XTensor::IsSameShaped(a, b)),           \
                   "Input tensors should have the same type!"); \
-    CheckNTErrors((a->isSparse == false), "TODO!");        \
+    CheckNTErrors(a->isSparse == false, "TODO!");          \
                                                            \
     int gridSize[3];                                       \
     int blockSize[3];                                      \
...@@ -107,7 +124,9 @@ SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
 SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
 SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
 SIMPLE_UNARY_FUNCTION_GPU(Log, log)
+SIMPLE_UNARY_FUNCTION_GPU(Negate, cudanegate)
 SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
+SIMPLE_UNARY_FUNCTION_GPU(Sign, cudasign)
 SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
 SIMPLE_UNARY_FUNCTION_GPU(Square, cudasquare)
......
...@@ -92,6 +92,15 @@ void KernelLog(__half * a, __half * b, int size);
 /* set each entry to its logarithm value */
 void _CudaLog(const XTensor * a, XTensor * b);

+/* set each entry to its negative value (CUDA Kernel) */
+__global__
+void KernelNegate(DTYPE * a, DTYPE * b, int size);
+
+/* set each entry to its negative value (CUDA Kernel) with float16 data type */
+__global__
+void KernelNegate(__half * a, __half * b, int size);
+
+/* set each entry to its negative value */
+void _CudaNegate(const XTensor * a, XTensor * b);
+
 /* set each entry to its round value (CUDA Kernel) */
 __global__
 void KernelRound(DTYPE * a, DTYPE * b, int size);
...@@ -101,6 +110,15 @@ void KernelRound(__half * a, __half * b, int size);
 /* set each entry to its round value */
 void _CudaRound(const XTensor * a, XTensor * b);

+/* set each entry to its sign value (CUDA Kernel) */
+__global__
+void KernelSign(DTYPE * a, DTYPE * b, int size);
+
+/* set each entry to its sign value (CUDA Kernel) with float16 data type */
+__global__
+void KernelSign(__half * a, __half * b, int size);
+
+/* set each entry to its sign value */
+void _CudaSign(const XTensor * a, XTensor * b);
+
 /* set each entry to its sqrt value (CUDA Kernel) */
 __global__
 void KernelSqrt(DTYPE * a, DTYPE * b, int size);
......
...@@ -31,144 +31,210 @@ void _Absolute(const XTensor * a, XTensor * b);
 /* set every entry to its absolute value (do it on site)
    keep the result in the input tensor a and return nothing */
 void _AbsoluteMe(XTensor * a);
+/* set every entry to its absolute value (do it on site)
+   keep the result in the input tensor a and return nothing */
+void AbsoluteMe(XTensor & a);
 /* set every entry to its absolute value (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Absolute(const XTensor & a);
 /* set every entry to its absolute value */
-void Absolute(const XTensor & a, XTensor & b, bool requireLink = false);
+void Absolute(const XTensor & a, XTensor & b);

 /* set every entry to its ceil value */
 void _Ceil(const XTensor * a, XTensor * b);
 /* set every entry to its ceil value (do it on site)
    keep the result in the input tensor a and return nothing */
 void _CeilMe(XTensor * a);
+/* set every entry to its ceil value (do it on site)
+   keep the result in the input tensor a and return nothing */
+void CeilMe(XTensor & a);
 /* set every entry to its ceil value (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Ceil(const XTensor & a);
 /* set every entry to its ceil value */
-void Ceil(const XTensor & a, XTensor & b, bool requireLink = false);
+void Ceil(const XTensor & a, XTensor & b);

 /* set every entry to its exponent value */
 void _Exp(const XTensor * a, XTensor * b);
 /* set every entry to its exponent value (do it on site)
    keep the result in the input tensor a and return nothing */
 void _ExpMe(XTensor * a);
+/* set every entry to its exponent value (do it on site)
+   keep the result in the input tensor a and return nothing */
+void ExpMe(XTensor & a);
 /* set every entry to its exponent value (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Exp(const XTensor & a);
 /* set every entry to its exponent value */
-void Exp(const XTensor & a, XTensor & b, bool requireLink = false);
+void Exp(const XTensor & a, XTensor & b);

 /* set every entry to its floor value */
 void _Floor(const XTensor * a, XTensor * b);
 /* set every entry to its floor value (do it on site)
    keep the result in the input tensor a and return nothing */
 void _FloorMe(XTensor * a);
+/* set every entry to its floor value (do it on site)
+   keep the result in the input tensor a and return nothing */
+void FloorMe(XTensor & a);
 /* set every entry to its floor value (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Floor(const XTensor & a);
 /* set every entry to its floor value */
-void Floor(const XTensor & a, XTensor & b, bool requireLink = false);
+void Floor(const XTensor & a, XTensor & b);

 /* if source entry is non-zero, set target entry to be one, otherwise zero */
 void _IsNonZero(const XTensor *a, XTensor *b);
 /* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
    keep the result in the input tensor a and return nothing */
 void _IsNonZeroMe(XTensor *a);
+/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
+   keep the result in the input tensor a and return nothing */
+void IsNonZeroMe(XTensor &a);
 /* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor IsNonZero(const XTensor &a);
 /* if source entry is non-zero, set target entry to be one, otherwise zero */
-void IsNonZero(const XTensor &a, XTensor & b, bool requireLink = false);
+void IsNonZero(const XTensor &a, XTensor & b);

 /* if source entry is zero, set target entry to be one, otherwise zero */
 void _IsZero(const XTensor *a, XTensor *b);
 /* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
    keep the result in the input tensor a and return nothing */
 void _IsZeroMe(XTensor *a);
+/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
+   keep the result in the input tensor a and return nothing */
+void IsZeroMe(XTensor &a);
 /* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor IsZero(const XTensor &a);
 /* if source entry is zero, set target entry to be one, otherwise zero */
-void IsZero(const XTensor &a, XTensor & b, bool requireLink = false);
+void IsZero(const XTensor &a, XTensor & b);

 /* set every entry to its logarithm value */
 void _Log(const XTensor * a, XTensor * b);
 /* set every entry to its logarithm value (do it on site)
    keep the result in the input tensor a and return nothing */
 void _LogMe(XTensor * a);
/* set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing */
void LogMe(XTensor & a);
/* set every entry to its logarithm value (return an XTensor structure) /* set every entry to its logarithm value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a); XTensor Log(const XTensor & a);
/* set every entry to its logarithm value */ /* set every entry to its logarithm value */
void Log(const XTensor & a, XTensor & b, bool requireLink = false); void Log(const XTensor & a, XTensor & b);
/* set every entry to its negative value */
void _Negate(const XTensor * a, XTensor * b);
/* set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing */
void _NegateMe(XTensor * a);
/* set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing */
void NegateMe(XTensor & a);
/* set every entry to its negative value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Negate(const XTensor & a);
/* set every entry to its negative value */
void Negate(const XTensor & a, XTensor & b);
/* set every entry to its round value */ /* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b); void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site) /* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a); void _RoundMe(XTensor * a);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void RoundMe(XTensor & a);
/* set every entry to its round value (return an XTensor structure) /* set every entry to its round value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a); XTensor Round(const XTensor & a);
/* set every entry to its round value */ /* set every entry to its round value */
void Round(const XTensor & a, XTensor & b, bool requireLink = false); void Round(const XTensor & a, XTensor & b);
/* set every entry to its sign value */
void _Sign(const XTensor * a, XTensor * b);
/* set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing */
void _SignMe(XTensor * a);
/* set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing */
void SignMe(XTensor & a);
/* set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b);
/* set every entry to its sqrt value */ /* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b); void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site) /* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a); void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void SqrtMe(XTensor & a);
/* set every entry to its sqrt value (return an XTensor structure) /* set every entry to its sqrt value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a); XTensor Sqrt(const XTensor & a);
/* set every entry to its sqrt value */ /* set every entry to its sqrt value */
void Sqrt(const XTensor & a, XTensor & b, bool requireLink = false); void Sqrt(const XTensor & a, XTensor & b);
/* set every entry to its square value */ /* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b); void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site) /* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a); void _SquareMe(XTensor * a);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void SquareMe(XTensor & a);
/* set every entry to its square value (return an XTensor structure) /* set every entry to its square value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a); XTensor Square(const XTensor & a);
/* set every entry to its square value */ /* set every entry to its square value */
void Square(const XTensor & a, XTensor & b, bool requireLink = false); void Square(const XTensor & a, XTensor & b);
/* set every entry to its sine value */ /* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b); void _Sin(const XTensor * a, XTensor * b);
/* set every entry to its sine value (do it on site) /* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _SinMe(XTensor * a); void _SinMe(XTensor * a);
/* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */
void SinMe(XTensor & a);
/* set every entry to its sine value (return an XTensor structure) /* set every entry to its sine value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a); XTensor Sin(const XTensor & a);
/* set every entry to its sine value */ /* set every entry to its sine value */
void Sin(const XTensor & a, XTensor & b, bool requireLink = false); void Sin(const XTensor & a, XTensor & b);
/* set every entry to its cosine value */ /* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b); void _Cos(const XTensor * a, XTensor * b);
/* set every entry to its cosine value (do it on site) /* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _CosMe(XTensor * a); void _CosMe(XTensor * a);
/* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */
void CosMe(XTensor & a);
/* set every entry to its cosine value (return an XTensor structure) /* set every entry to its cosine value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a); XTensor Cos(const XTensor & a);
/* set every entry to its cosine value */ /* set every entry to its cosine value */
void Cos(const XTensor & a, XTensor & b, bool requireLink = false); void Cos(const XTensor & a, XTensor & b);
/* set every entry to its tangent value */ /* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b); void _Tan(const XTensor * a, XTensor * b);
/* set every entry to its tangent value (do it on site) /* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _TanMe(XTensor * a); void _TanMe(XTensor * a);
/* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */
void TanMe(XTensor & a);
/* set every entry to its tangent value (return an XTensor structure) /* set every entry to its tangent value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
/* set every entry to its tangent value */ /* set every entry to its tangent value */
void Tan(const XTensor & a, XTensor & b, bool requireLink = false); void Tan(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
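The commit message says the Negate and Sign entry points above are generated by a macro rather than written out by hand, but the macro itself is not shown in this diff. The following is only a minimal self-contained sketch of that pattern; ToyTensor, GENERATE_UNARY_OP, negateFunc and signFunc are illustrative names, not the library's real ones. One expansion yields all four calling conventions declared above (_Op, _OpMe, OpMe, and Op returning a new tensor):
#include <cstddef>
#include <vector>
/* toy stand-in for XTensor, just enough to show the macro pattern */
struct ToyTensor { std::vector<float> data; };
/* one macro expansion produces the four entry points for a unary operation */
#define GENERATE_UNARY_OP(opName, scalarFunc)                         \
inline void _##opName(const ToyTensor * a, ToyTensor * b) {           \
    b->data.resize(a->data.size());                                   \
    for (std::size_t i = 0; i < a->data.size(); i++)                  \
        b->data[i] = scalarFunc(a->data[i]);                          \
}                                                                     \
inline void _##opName##Me(ToyTensor * a) { _##opName(a, a); }         \
inline void opName##Me(ToyTensor & a) { _##opName(&a, &a); }          \
inline ToyTensor opName(const ToyTensor & a) {                        \
    ToyTensor b;                                                      \
    _##opName(&a, &b);                                                \
    return b;                                                         \
}
inline float negateFunc(float x) { return -x; }
inline float signFunc(float x) { return (x > 0.0f) ? 1.0f : (x < 0.0f ? -1.0f : 0.0f); }
GENERATE_UNARY_OP(Negate, negateFunc)
GENERATE_UNARY_OP(Sign, signFunc)
The real macro in the repository presumably also threads device IDs, data-type checks, and the CUDA kernels through the same expansion; the sketch only shows how one definition yields all four signatures.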
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#include <math.h>
#include "CrossEntropy.h"
#include "CrossEntropy.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/arithmetic/Negate.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> loss - the computed loss (for return)
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropy(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
}
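As a quick numeric check of the formula above (a standalone snippet, independent of the library): for a one-hot gold distribution [0, 1, 0] and prediction [0.2, 0.7, 0.1], the loss reduces to -log(0.7), roughly 0.357.
#include <cmath>
#include <cstdio>
int main()
{
    const double gold[3]   = { 0.0, 1.0, 0.0 };
    const double output[3] = { 0.2, 0.7, 0.1 };
    double loss = 0.0;
    /* loss = sum_i (-gold_i * log(output_i)) */
    for (int i = 0; i < 3; i++)
        loss += -gold[i] * std::log(output[i]);
    std::printf("loss = %f\n", loss); /* prints 0.356675 */
    return 0;
}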
/*
compute the cross entropy loss (faster implementation with optimized code)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> loss - the computed loss (for return)
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1,
"Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == loss->GetDim(i - 1)), "Unmatched tensors!");
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyFast(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE * lossData = (DTYPE*)loss->data;
DTYPE tmpLoss;
int lossPos;
int goldPos;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
tmpLoss = 0;
lossPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + lossPos) == 0)
*(lossData + lossPos) = 0;
else {
tmpLoss = 0;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
tmpLoss = 0;
lossPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
lossPos = i * stride + j;
if(*(paddingData + lossPos) == 0)
*(lossData + lossPos) = 0;
else {
tmpLoss = 0;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
tmpLoss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
*(lossData + lossPos) = tmpLoss;
}
}
}
}
}
}
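The addressing above flattens the tensor into blockNum blocks of blockSize = stride * leadingDimSize elements, so the class index k moves in steps of stride while (i, j) picks one loss position. A standalone check of that index math on a 2 x 4 x 3 tensor with the class dimension in the middle (n = 1), under the assumption of row-major layout:
#include <cstdio>
int main()
{
    /* shape (2, 4, 3), leading (class) dimension n = 1 */
    const int dim0 = 2, leadingDimSize = 4, dim2 = 3;
    const int stride = dim2;                       /* product of dims after n */
    const int blockSize = stride * leadingDimSize; /* 12 */
    const int blockNum = dim0;                     /* unitNum / blockSize */
    for (int i = 0; i < blockNum; i++)
        for (int j = 0; j < stride; j++)
            for (int k = 0; k < leadingDimSize; k++) {
                int goldPos = i * blockSize + j + k * stride;
                /* goldPos is exactly the row-major offset of element (i, k, j) */
                int rowMajor = (i * leadingDimSize + k) * dim2 + j;
                if (goldPos != rowMajor)
                    std::printf("mismatch at %d %d %d\n", i, j, k);
            }
    std::printf("index scheme verified\n");
    return 0;
}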
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> reduceWay - loss compute way, sum or mean
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int unitNum = output->dimSize[n];
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = output->dimSize[i];
else if(i > n)
dimSize[i - 1] = output->dimSize[i];
}
XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
ShowNTErrors("TODO");
}
delete[] dimSize;
DelTensorBuf(lossBuf);
return loss;
}
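REDUCE_MEAN above divides by the number of non-padded loss positions, which the function obtains by running _IsNonZero on the padding tensor and summing the result with _ReduceSumAll. The same bookkeeping on plain arrays (a standalone illustration only):
#include <cstdio>
int main()
{
    const float lossPerPos[4] = { 0.5f, 0.0f, 1.2f, 0.3f };
    const float padding[4]    = { 1.0f, 0.0f, 1.0f, 1.0f };
    float sum = 0.0f;
    int nonZeroNum = 0;
    for (int i = 0; i < 4; i++) {
        sum += lossPerPos[i] * padding[i]; /* padded positions contribute 0 */
        if (padding[i] != 0.0f)
            nonZeroNum += 1;               /* what _IsNonZero + _ReduceSumAll count */
    }
    std::printf("sum = %f, mean = %f\n", sum, sum / nonZeroNum); /* 2.0, 0.666667 */
    return 0;
}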
/*
compute the cross entropy loss (faster implementation with optimized code)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> reduceWay - loss compute way, sum or mean
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
<< return - the cross entropy loss that is a scalar
*/
DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyFast(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
int paddingPos;
int goldPos;
int nonZeroNum = 0;
if(weight == NULL) {
if(padding == NULL) {
nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue;
else {
nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos));
}
}
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
nonZeroNum = blockNum * stride;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
if(*(paddingData + paddingPos) == 0)
continue;
else {
nonZeroNum += 1;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
loss += -(*(goldData + goldPos)) *
(DTYPE)log(*(outputData + goldPos)) *
(*(weightData + k)); /* weight is indexed by the class index k, as in the other branches */
}
}
}
}
}
}
if(reduceWay == REDUCE_MEAN) {
loss = loss / (DTYPE)nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
ShowNTErrors("TODO");
}
return loss;
}
/*
backward computation for the cross entropy function
loss = sum_{i} (-t_i * log(y_i))
dE/dy_i = -t_i / y_i
where E is the error (loss) function that measures the errors in y
with respect to the gold standard, and y is the model output
>> dedy - dE/dy (for return)
>> output - model prediction
>> gold - gold standard
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
if(padding != NULL) {
for(int i = 0; i < order; i++){
if(i < n){
CheckNTErrors((output->GetDim(i) == padding->GetDim(i)), "Unmatched tensors!");
}
else if(i > n){
CheckNTErrors((output->GetDim(i) == padding->dimSize[i - 1]), "Unmatched tensors!");
}
}
}
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
DTYPE * dedyData = (DTYPE*)dedy->data;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
int paddingPos;
int goldPos;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
if(*(paddingData + paddingPos) == 0)
*(dedyData + goldPos) = 0;
else
*(dedyData + goldPos) = -(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
*(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
for(int j = 0; j < stride; j++) {
paddingPos = i * stride + j;
for(int k = 0; k < leadingDimSize; k++) {
goldPos = i * blockSize + j + k * stride;
if(*(paddingData + paddingPos) == 0)
*(dedyData + goldPos) = 0;
else
*(dedyData + goldPos) = -(*(weightData + k)) *
(*(goldData + goldPos)) /
(*(outputData + goldPos));
}
}
}
}
}
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
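The derivative dE/dy_i = -t_i / y_i used above can be sanity-checked against a central finite difference of the scalar loss (a standalone snippet, not part of the library):
#include <cmath>
#include <cstdio>
/* loss = sum_i (-t_i * log(y_i)) */
static double Loss(const double * t, const double * y, int n)
{
    double loss = 0.0;
    for (int i = 0; i < n; i++)
        loss += -t[i] * std::log(y[i]);
    return loss;
}
int main()
{
    double t[3] = { 0.0, 1.0, 0.0 };
    double y[3] = { 0.2, 0.7, 0.1 };
    const double eps = 1e-6;
    for (int i = 0; i < 3; i++) {
        double analytic = -t[i] / y[i];               /* dE/dy_i = -t_i / y_i */
        double saved = y[i];
        y[i] = saved + eps;
        double up = Loss(t, y, 3);
        y[i] = saved - eps;
        double down = Loss(t, y, 3);
        y[i] = saved;
        double numeric = (up - down) / (2.0 * eps);   /* central difference */
        std::printf("i=%d analytic=%f numeric=%f\n", i, analytic, numeric);
    }
    return 0;
}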
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __CROSSENTROPY_CUH__
#define __CROSSENTROPY_CUH__
#include "../XTensor.h"
#include "../XDevice.h"
#include "CrossEntropy.cuh"
#include "CrossEntropy.h"
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/shape/Transpose.h"
#include "../core/shape/Unsqueeze.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> loss - the computed loss (for return)
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
}
/*
compute the cross entropy loss (scalar version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> reduceWay - loss compute way, sum or mean
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
<< return - the cross entropy loss that is a scalar
*/
DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int * dimSize = new int[output->order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = output->dimSize[i];
else if(i > n)
dimSize[i - 1] = output->dimSize[i];
}
XTensor * lossBuf = NewTensorBuf(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
_CudaCrossEntropyFast(output, gold, lossBuf, weight, padding, leadingDim);
loss = _ReduceSumAll(lossBuf);
if(reduceWay == REDUCE_MEAN) {
int nonZeroNum;
if(padding == NULL) {
nonZeroNum = lossBuf->unitNum;
}
else {
XTensor * tmp = NewTensorBuf(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
nonZeroNum = (int)_ReduceSumAll(tmp);
DelTensorBuf(tmp);
}
loss = loss / (DTYPE)nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
ShowNTErrors("TODO");
}
delete[] dimSize;
DelTensorBuf(lossBuf);
return loss;
}
/*
backward computation of cross entropy function
loss = sum_{i} (-t_i * log(y_i))
dE/dy_i = -t_i / y_i
where E is the error (loss) function that measures the errors in y
with respect to the gold standard, and y is the model output
>> dedy - dE/dy (for return)
>> output - model prediction
>> gold - gold standard
>> weight - a rescaling weight given to each class
>> padding - specify a target value that is ignored and does not contribute to the loss computation
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight,
XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
_Div(gold, output, dedy);
_NegateMe(dedy);
if(weight != NULL)
_MultiplyDimMe(dedy, weight, n);
if(padding != NULL) {
int paddingOrder = padding->order;
int * paddingDims = new int[paddingOrder];
memcpy(paddingDims, padding->dimSize, padding->order * sizeof(int));
padding->Reshape(padding->unitNum);
int order = dedy->order;
int * dims = new int[order];
memcpy(dims, dedy->dimSize, dedy->order * sizeof(int));
dedy->Reshape(dedy->unitNum/dedy->GetDim(n), dedy->GetDim(n));
_MultiplyDimMe(dedy, padding, 0);
padding->Reshape(paddingOrder, paddingDims);
dedy->Reshape(order, dims);
delete[] paddingDims;
delete[] dims;
}
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
}
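The padding branch above reshapes dedy to a 2-D view of shape (unitNum / dim_n, dim_n) so the flattened padding tensor can be broadcast down dimension 0: every class column of a padded row is zeroed in one multiply. The same idea on plain arrays (standalone and illustrative only):
#include <cstdio>
int main()
{
    /* dedy viewed as (positions, classes) = (3, 4); padding has one entry per position */
    double dedy[3][4] = {
        { -1.0, -2.0, -3.0, -4.0 },
        { -5.0, -6.0, -7.0, -8.0 },
        { -9.0, -1.0, -2.0, -3.0 }
    };
    const double padding[3] = { 1.0, 0.0, 1.0 };
    /* broadcasting padding along dimension 0 zeroes the whole second row */
    for (int r = 0; r < 3; r++)
        for (int c = 0; c < 4; c++)
            dedy[r][c] *= padding[r];
    for (int r = 0; r < 3; r++)
        std::printf("%g %g %g %g\n", dedy[r][0], dedy[r][1], dedy[r][2], dedy[r][3]);
    return 0;
}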
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __CROSSENTROPY_CUH__
#define __CROSSENTROPY_CUH__
#include "../XTensor.h"
#include "CrossEntropy.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compute the cross entropy loss */
void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss */
DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __CROSSENTROPY_H__
#define __CROSSENTROPY_H__
#include "../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
enum LOSS_COMPUTE_WAY{
REDUCE_SUM,
REDUCE_MEAN
};
/* compute the cross entropy loss */
void _CrossEntropy(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss */
void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss (return the loss) */
DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss (return the loss) */
DTYPE _CrossEntropyFast(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay = REDUCE_MEAN, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
const XTensor * gold, const XTensor * weight = NULL,
XTensor * padding = NULL, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_H__
\ No newline at end of file
@@ -25,7 +25,6 @@
#include "../core/math/Power.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/math/Unary.h"
-#include "../core/arithmetic/Negate.h"
#include "../core/arithmetic/Sum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/reduce/ReduceSum.h"
@@ -28,7 +28,6 @@
#include "../core/arithmetic/Multiply.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
-#include "../core/arithmetic/Negate.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
@@ -29,7 +29,6 @@
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
-#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
@@ -22,7 +22,7 @@
#ifndef __TEST_NEGATE_H__
#define __TEST_NEGATE_H__
-#include "../core/arithmetic/Negate.h"
+#include "../core/math/Unary.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -22,7 +22,7 @@
#ifndef __TEST_SIGN_H__
#define __TEST_SIGN_H__
-#include "../core/arithmetic/Sign.h"
+#include "../core/math/Unary.h"
namespace nts { // namespace nts(NiuTrans.Tensor)