Commit 28fd2d94 by xiaotong

new code

parent 992ee9e9
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
*/
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/LogSoftmax.h"
namespace nts{
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(gold && y && x, "Empty input tensors!");
CheckNTErrors(dedx, "Empty gradient tensors!");
CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
}
}
/*
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
LOSS_FUNCTION_NAME lossName)
{
LossBackward(dedy, gold, y, lossName);
}
}
\ No newline at end of file
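A minimal sketch of how the first Compute overload above is typically driven (not part of the commit; gold, y, x and x->grad are hypothetical tensors assumed to be already allocated). The leading dimension of LogSoftmax travels through the generic params pointer and is unpacked inside Compute:

/* sketch only: leadDim mimics what the XLink of a LogSoftmax output
   would carry in income.params */
int leadDim = 1;
XLossGrad lossGrad;
lossGrad.Compute(gold, y, x,
                 NULL, x->grad,
                 FUNC_LOGSOFTMAX, &leadDim,
                 CROSSENTROPY);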
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
* My students worked all night to prepare a submission to CWMT. Good luck
* to them!
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
namespace nts{
/* this class computes the gradient (of a output node)
with respect to the loss */
class XLossGrad
{
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
LOSS_FUNCTION_NAME lossName);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
*/
#include "XBackwardNode.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/XName.h"
namespace nts{
/* make gradient tensor for a node */
void XNoder::MakeGrad(XTensor * node)
{
if(node == NULL)
return;
if(!XTensor::IsIdentical(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
}
}
/* the node is a leaf node (intput) or not */
bool XNoder::IsLeaf(XTensor * node)
{
if(node == NULL)
return false;
if(node->income.tailNum == 0)
return true;
else
return false;
}
/* the node is a root node (output) or not */
bool XNoder::IsRoot(XTensor * node)
{
if(node == NULL)
return false;
if(node->outgo.tailNum == 0)
return true;
else
return false;
}
/* the node keeps the gradinent or not */
bool XNoder::IsGrad(XTensor * node)
{
if(node == NULL)
return false;
if(node->isGrad)
return true;
else
return false;
}
/*
compute dE/dx of a node
Note that input of the node could be arbitrary tensors
>> node - node of the network
*/
void XNodeGrad::Compute(XTensor * node)
{
if(node == NULL || node->visitMark == NODE_FINISHED)
return;
if(!XNoder::IsLeaf(node)){
}
node->visitMark = NODE_FINISHED;
}
/* indicates whether the node is for a math operation */
bool XNodeGrad::IsMathOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & MATH_BASE) != 0;
}
/* compute dE/dx as a math operation, e.g., sum, multiply ... */
void XNodeGrad::ComputeMath(XTensor * node)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
/* c = a + b * \beta
dE/da = dE/dc
dE/db = dE/dc * \beta */
if(operID == MATH_SUM){
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_CopyValues(node->grad, a->grad);
if(beta != 1.0F)
_ScaleAndShift(node->grad, a->grad, beta);
else
_CopyValues(node->grad, b->grad);
}
/* c = a * b
dE/da = dE/dc * b
dE/db = dE/dc * a */
else if(operID == MATH_MULTIPLY){
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int leadDim = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsIdentical(a, b), "Wrong sized input tensors!");
_Multiply(node->grad, b, a->grad);
_Multiply(node->grad, a, b->grad);
}
else{
ShowNTErrors("TODO!");
}
}
}
\ No newline at end of file
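A hedged sketch of how the math-operation branch above would be exercised for a SUM node (illustrative only; the node c and its inputs are hypothetical and assumed to have been connected via XLink::MakeLink with MATH_SUM and beta stored as parameter 0):

/* c = a + b * beta was recorded in c.income, so the rules above give
       a.grad <- c.grad
       b.grad <- c.grad * beta */
XNodeGrad grad;
if(grad.IsMathOP(&c))
    grad.ComputeMath(&c);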
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDNODE_H__
#define __XBACKWARDNODE_H__
namespace nts{
#define NODE_UNFINISHED 0
#define NODE_DOING 1
#define NODE_FINISHED 2
/* node management */
class XNoder
{
public:
/* make gradient tensor for a node */
static
void MakeGrad(XTensor * node);
/* the node is a leaf node (intput) or not */
static
bool IsLeaf(XTensor * node);
/* the node is a root node (output) or not */
static
bool IsRoot(XTensor * node);
/* the node keeps the gradinent or not */
static
bool IsGrad(XTensor * node);
};
/* this class computes the gradient for each node in the network */
class XNodeGrad
{
public:
/* compute dE/dx of a node */
void Compute(XTensor * node);
/* indicates whether the node is for a math operation */
bool IsMathOP(XTensor * node);
/* compute dE/dx as a math operation, e.g., sum, multiply ... */
void ComputeMath(XTensor * node);
};
}
#endif
\ No newline at end of file
@@ -20,6 +20,9 @@
 */
 #include "XNet.h"
+#include "XBackwardLoss.h"
+#include "XBackwardNode.h"
+#include "../tensor/XName.h"
 namespace nts{
@@ -87,6 +90,44 @@ with a number of root nodes
 void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
 {
     Traverse(roots);
+
+    for(int i = 0; i < nodes.count; i++){
+        XTensor * node = (XTensor*)nodes.Get(i);
+        node->visitMark = NODE_UNFINISHED;
+    }
+
+    XLossGrad lossGrad;
+
+    /* we start with the gradient with respect to the loss for output layers */
+    for(int i = 0; i < roots.count; i++){
+        XTensor * root = (XTensor*)roots.Get(i);
+        XTensor * gold = (XTensor*)golds.Get(i);
+        XLink &income = root->income;
+        int funcID = income.typeID;
+        void * params = income.params;
+
+        /* we compute dE/dx if the output is generated by an activation function y = f(x).
+           Note that we do not need to obtain dE/dy here because it is of no use in the
+           following process of back-propagation */
+        if(income.tailNum == 1 && (funcID & FUNCTION_BASE)){
+            XTensor * x = income.tails[0];
+            XNoder::MakeGrad(x);
+            lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
+            root->visitMark = NODE_FINISHED;
+        }
+        /* we compute dE/dy (y is the output) if no predefined activation function is used */
+        else{
+            XNoder::MakeGrad(root);
+            lossGrad.Compute(gold, root, root->grad, loss);
+        }
+    }
+
+    /* back-propagation from output to input */
+    for(int i = nodes.count - 1; i >= 0; i--){
+        XTensor * node = (XTensor*)nodes.Get(i);
+        if(node->visitMark == NODE_FINISHED)
+            continue;
+    }
 }
 /*
@@ -115,6 +156,15 @@ void XNet::Traverse(XList &roots)
     for (int i = 0; i < roots.count; i++)
         TarjanVisit((XTensor*)roots.Get(i), nodes, id);
+
+    for(int i = 0; i < nodes.count; i++){
+        XTensor * node = (XTensor*)nodes.Get(i);
+        if(XNoder::IsRoot(node))
+            outputs.Add(node);
+        if(XNoder::IsLeaf(node))
+            inputs.Add(node);
+        if(XNoder::IsGrad(node))
+            gradNodes.Add(node);
+    }
 }
 /*
......
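A brief end-to-end sketch of driving the new backward pass (not from the commit; y and gold are hypothetical output and gold-standard tensors, the XList constructor with a capacity argument is assumed, and the graph is assumed to have been built with the link-aware wrappers such as Multiply and HardTanH):

XNet net;
XList roots(1);
XList golds(1);
roots.Add(&y);
golds.Add(&gold);

/* traverses the graph, seeds the loss gradient at the roots,
   then walks the node list in reverse topological order */
net.Backward(roots, golds, CROSSENTROPY);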
@@ -45,11 +45,10 @@ int main( int argc, const char ** argv )
     //_CrtSetBreakAlloc(123);
 
     /* a tiny test */
-    if(true)
-        SmallTest();
+    SmallTest();
 
     //_CrtDumpMemoryLeaks();
-    return 0;
+    //return 0;
 
     if(argc > 1 && !strcmp(argv[1], "-test"))
         Test();
......
@@ -25,7 +25,7 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 
-int XLink::paramSize = 64;
+int XLink::paramSize = PARAM_UNTI_SIZE;
 
 /* constructor */
 XLink::XLink()
@@ -235,6 +235,26 @@ void XLink::AddParam(void * param, int size)
 }
 
 /*
+get a parameter in the default type
+>> i - id of the parameter
+*/
+DTYPE XLink::GetParam(int i)
+{
+    char * p = (char*)params + i * paramSize;
+    return *(DTYPE*)p;
+}
+
+/*
+get a parameter in integer
+>> i - id of the parameter
+*/
+int XLink::GetParamInt(int i)
+{
+    char * p = (char*)params + i * paramSize;
+    return *(int*)p;
+}
+
+/*
 create a hyperedge with two input tensors and an output tensor
 >> t1 - a tail tensor
 >> t2 - another tail tensor
......
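The two getters above rely on the fixed-slot layout used by AddParam: every parameter occupies paramSize bytes, so slot i begins at params + i * paramSize. A small hypothetical usage (not part of the commit):

XLink link;
DTYPE beta = 2.0F;
int leadDim = 0;

link.AddParam(beta);                  /* slot 0, stored as DTYPE */
link.AddParam(&leadDim, sizeof(int)); /* slot 1, stored as raw bytes */

DTYPE b = link.GetParam(0);           /* reads slot 0 as DTYPE -> 2.0 */
int   d = link.GetParamInt(1);        /* reads slot 1 as int   -> 0   */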
@@ -34,6 +34,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 struct XTensor;
 
 #define MAX_OP_NAME_LENGTH 16
+#define PARAM_UNTI_SIZE 64
 
 /*
 This defines the link among tensors in networks. XLink can be
@@ -115,12 +116,18 @@ struct XLink
     /* add two tails in one time */
     void AddTwoTails(XTensor * t1, XTensor * t2);
 
-    /* add an integer parameter */
+    /* add a parameter in the default type */
     void AddParam(DTYPE param);
 
-    /* add an integer parameter */
+    /* add a parameter */
     void AddParam(void * param, int size);
 
+    /* get a parameter in the default type */
+    DTYPE GetParam(int i);
+
+    /* get a parameter in integer */
+    int GetParamInt(int i);
+
     /* create a hyperedge with two input tensors and an output tensor */
     static
     void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
......
@@ -26,7 +26,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* get operator name */
 const char * GetOPName(int type)
 {
-    if((type & MATH_ARITHMETIC) != 0){
+    if((type & MATH_BASE) != 0){
         if(type == MATH_ABSOLUTE)
             return "M_ABSOLUTE";
         else if(type == MATH_MATRIXMUL)
@@ -49,12 +49,6 @@ const char * GetOPName(type)
             return "M_POWER";
         else if(type == MATH_SCALEANDSHIFT)
             return "M_SCALEANDSHIFT";
-        else if(type == GETANDSET_SELECT)
-            return "G_SELECT";
-        else if(type == MOVEMENT_COPYINDEXED)
-            return "M_COPYINDEXED";
-        else if(type == MOVEMENT_COPYVALUES)
-            return "M_COPYVALUES";
         else if(type == REDUCE_REDUCEMAX)
             return "R_REDUCEMAX";
         else if(type == REDUCE_REDUCEMEAN)
@@ -65,6 +59,14 @@ const char * GetOPName(type)
             return "R_REDUCESUMSQUARED";
         else if(type == REDUCE_REDUCEVARIANCE)
             return "R_REDUCEVARIANCE";
+    }
+    else if((type & DATA_BASE) != 0){
+        if(type == GETANDSET_SELECT)
+            return "G_SELECT";
+        else if(type == MOVEMENT_COPYINDEXED)
+            return "M_COPYINDEXED";
+        else if(type == MOVEMENT_COPYVALUES)
+            return "M_COPYVALUES";
         else if(type == SHAPE_CONCATENATE)
             return "S_CONCATENATE";
         else if(type == SHAPE_MERGE)
......
@@ -28,8 +28,9 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
-#define MATH_ARITHMETIC 0x00001000
-#define MATH_ABSOLUTE MATH_ARITHMETIC + 1
+/* math operations */
+#define MATH_BASE 0x00001000
+#define MATH_ABSOLUTE MATH_BASE + 1
 #define MATH_MATRIXMUL MATH_ABSOLUTE + 1
 #define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
 #define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
@@ -42,20 +43,22 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_POWER MATH_NORMALIZE + 1
 #define MATH_SCALEANDSHIFT MATH_POWER + 1
 
-#define GETANDSET MATH_SCALEANDSHIFT + 1
-#define GETANDSET_SELECT GETANDSET + 1
-
-#define MOVEMENT GETANDSET_SELECT + 1
-#define MOVEMENT_COPYINDEXED MOVEMENT + 1
-#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
-
-#define REDUCE MOVEMENT_COPYVALUES + 1
+#define REDUCE MATH_SCALEANDSHIFT + 1
 #define REDUCE_REDUCEMAX REDUCE + 1
 #define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
 #define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
 #define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1
 #define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
 
+/* data and shape related operations */
+#define DATA_BASE MATH_BASE * 2
+#define GETANDSET DATA_BASE + 1
+#define GETANDSET_SELECT GETANDSET + 1
+
+#define MOVEMENT GETANDSET_SELECT + 1
+#define MOVEMENT_COPYINDEXED MOVEMENT + 1
+#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
+
 #define SHAPE REDUCE_REDUCEVARIANCE + 1
 #define SHAPE_CONCATENATE SHAPE + 1
 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
@@ -64,6 +67,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define SHAPE_TRANSPOSE SHAPE_SPLIT + 1
 #define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
 
+/* activation functions */
+#define FUNCTION_BASE DATA_BASE * 2
+#define FUNC_HARDTANH FUNCTION_BASE + 1
+#define FUNC_IDENTITY FUNC_HARDTANH + 1
+#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
+#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
+#define FUNC_SIGMOID FUNC_RECTIFY + 1
+#define FUNC_SOFTMAX FUNC_SIGMOID + 1
+
 /* get operator name */
 const char * GetOPName(int type);
......
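A small sketch of why the three bases are spaced as powers of two: each base contributes a distinct bit (0x1000, 0x2000, 0x4000) and member IDs only add small offsets, so masking an ID against a base tells which family it belongs to. This mirrors the checks used in GetOPName and XLossGrad::Compute; the values below restate the macros above rather than include the header, so they are illustrative only.

#include <cstdio>

#define MATH_BASE       0x00001000
#define DATA_BASE       (MATH_BASE * 2)       /* 0x2000 */
#define FUNCTION_BASE   (DATA_BASE * 2)       /* 0x4000 */
#define FUNC_LOGSOFTMAX (FUNCTION_BASE + 3)   /* same offset as in the list above */

int main()
{
    int id = FUNC_LOGSOFTMAX;
    printf("math op?     %d\n", (id & MATH_BASE) != 0);      /* 0 */
    printf("data op?     %d\n", (id & DATA_BASE) != 0);      /* 0 */
    printf("activation?  %d\n", (id & FUNCTION_BASE) != 0);  /* 1 */
    return 0;
}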
@@ -352,6 +352,9 @@ judge whether the two matrices are in the same type and size
 */
 bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
 {
+    if(a == NULL || b == NULL)
+        return false;
+
     if(a->order != b->order)
         return false;
......
@@ -142,16 +142,15 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
 element-wise product of two tensors (return an XTensor structure)
 make a new tensor c to keep the result and return it
 
-c(i) = a(i)*b(i) + \alpha * c(i)
+c(i) = a(i)*b(i)
 where i is the index of the item
 
 >> a - tensor a
 >> b - tensor b
->> alpha - the coefficient
 >> leadingDim - the dimension along which we perform broadcasting
 << return - the product of the tensors
 */
-XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
+XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
 {
     CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
@@ -159,11 +158,10 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
     c.SetTMP();
 
     /* call _Multiply function */
-    _Multiply(&a, &b, &c, alpha, leadingDim);
+    _Multiply(&a, &b, &c, 0, leadingDim);
 
     /* tensor connections */
     XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
-    XLink::AddParamToHead(&c, alpha);
     XLink::AddParamToHeadInt(&c, leadingDim);
 
     return c;
......
@@ -44,10 +44,10 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim
 /*
 element-wise product of two tensors (return an XTensor structure)
 make a new tensor to keep the result and return it
-c(i) = a(i)*b(i) + \alpha * c(i)
+c(i) = a(i)*b(i)
 where i is the index of the element
 */
-XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0);
+XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
 
 } // namespace nts(NiuTrans.Tensor)
......
@@ -20,6 +20,7 @@
 */
 #include <stdlib.h>
+#include "../XName.h"
 #include "HardTanH.h"
 #include "HardTanH.cuh"
@@ -58,6 +59,26 @@ void _HardTanH(const XTensor * x, XTensor * y)
         ShowNTErrors("TODO!");
 }
 
+/*
+hard tanh function (return a structure)
+y =  1    if x > 1
+     x    if -1 <= x <= 1
+    -1    if x < -1
+>> x - input tensor
+<< return - y
+*/
+XTensor HardTanH(const XTensor &x)
+{
+    XTensor y(&x);
+    y.SetTMP();
+
+    _HardTanH(&x, &y);
+
+    XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
+
+    return y;
+}
+
 /*
 backward computation
@@ -77,16 +98,16 @@ hard tanh: y = 1 if x > 1
 >> dedx - dE/dx
 >> lossName - type of loss function, e.g., cross entropy
 */
-void HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
+void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
                       XTensor * dedy, XTensor * dedx,
                       LOSS_FUNCTION_NAME lossName)
 {
     CheckNTErrors((gold == NULL || XTensor::IsIdentical(gold, y)),
                   "The tensors must be of the same size!");
 
 #ifdef USE_CUDA
     if(x->devID >= 0 || y->devID >= 0){
-        CudaHardTanHBackward(gold, y, x, dedy, dedx, lossName);
+        _CudaHardTanHBackward(gold, y, x, dedy, dedx, lossName);
         return;
     }
 #endif
......
@@ -129,9 +129,9 @@ hard tanh: y = 1 if x > 1
 >> dedx - dE/dx
 >> lossName - type of loss function, e.g., cross entropy
 */
-void CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
-                          XTensor * dedy, XTensor * dedx,
-                          LOSS_FUNCTION_NAME lossName)
+void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
+                           XTensor * dedy, XTensor * dedx,
+                           LOSS_FUNCTION_NAME lossName)
 {
     if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
......
@@ -40,9 +40,9 @@ void _CudaHardTanH(const XTensor * input, XTensor * output);
 /* de/dx (Cuda version) */
 extern "C"
-void CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
-                          XTensor * dedy, XTensor * dedx,
-                          LOSS_FUNCTION_NAME lossName);
+void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
+                           XTensor * dedy, XTensor * dedx,
+                           LOSS_FUNCTION_NAME lossName);
 
 #endif // USE_CUDA
......
@@ -33,14 +33,15 @@ y = 1 if x > 1
     x if -1 <= x <= 1
    -1 if x < -1
 */
-extern "C"
 void _HardTanH(const XTensor * x, XTensor * y);
 
+/* hard tanh function (return a structure) */
+XTensor HardTanH(const XTensor &x);
+
 /* de/dx */
-extern "C"
-void HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
-                      XTensor * dedy, XTensor * dedx,
-                      LOSS_FUNCTION_NAME lossName);
+void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
+                       XTensor * dedy, XTensor * dedx,
+                       LOSS_FUNCTION_NAME lossName);
 
 } // namespace nts(NiuTrans.Tensor)
......
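A hedged usage sketch of the new structure-returning HardTanH (not part of the commit; the tensor-construction helper NewTensor2D and the SetDataRand initializer are assumed from the rest of the library):

/* x is a hypothetical 2 x 3 float tensor */
XTensor * x = NewTensor2D(2, 3, X_FLOAT);
x->SetDataRand(-2.0F, 2.0F);

/* y records an XLink back to x with type FUNC_HARDTANH,
   which is what the backward pass dispatches on */
XTensor y = HardTanH(*x);

delete x;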
@@ -122,7 +122,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
             blockMax->data = mp;
             blockSum->data = sp;
 #ifdef USE_CUDA
-            CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
+            _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
 #else
             ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif
@@ -223,10 +223,10 @@ better numerical stability.
 >> lossName - type of loss function, e.g., cross entropy
 >> leadDim - leading dimension (along which we perform reduction)
 */
-void LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                        XTensor * dedy, XTensor * dedx,
-                        int leadDim,
-                        LOSS_FUNCTION_NAME lossName)
+void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
+                         XTensor * dedy, XTensor * dedx,
+                         int leadDim,
+                         LOSS_FUNCTION_NAME lossName)
 {
     CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
     CheckNTErrors((gold != NULL), "The gold standard cannot be empty!");
@@ -234,7 +234,7 @@ void LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if (gold->devID >= 0) {
-        CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
+        _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
         return;
     }
 #endif
......
@@ -139,7 +139,7 @@ log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version)
 >> sum - \sum_{i} e^{x_i}
 >> max - \max_{i} e^{x_i}
 */
-void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max)
+void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max)
 {
     CheckNTErrors((x->devID >= 0), "Forward computation of log softmax must be run on GPUs.");
     CheckNTErrors((x->devID == y->devID), "Input tensors must be on the same GPU.");
@@ -353,7 +353,7 @@ better numerical stability.
 >> lossName - type of loss function, e.g., cross entropy
 >> leadDim - leading dimension (along which we perform reduction)
 */
-void CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                            XTensor * dedy, XTensor * dedx,
-                            int leadDim,
-                            LOSS_FUNCTION_NAME lossName)
+void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
+                             XTensor * dedy, XTensor * dedx,
+                             int leadDim,
+                             LOSS_FUNCTION_NAME lossName)
......
@@ -30,16 +30,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
-extern "C"
 void _CudaLogSoftmax(const XTensor * input, XTensor * output, int leadDim);
 
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
-extern "C"
-void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
+void _CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
 
 /* de/dx (Cuda version) */
-extern "C"
-void CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                            XTensor * dedy, XTensor * dedx,
-                            int leadDim,
-                            LOSS_FUNCTION_NAME lossName);
+void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
+                             XTensor * dedy, XTensor * dedx,
+                             int leadDim,
+                             LOSS_FUNCTION_NAME lossName);
......
@@ -28,15 +28,13 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) */
-extern "C"
 void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
 
 /* de/dx */
-extern "C"
-void LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
-                        XTensor * dedy, XTensor * dedx,
-                        int leadDim,
-                        LOSS_FUNCTION_NAME lossName);
+void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
+                         XTensor * dedy, XTensor * dedx,
+                         int leadDim,
+                         LOSS_FUNCTION_NAME lossName);
 
 } // namespace nts(NiuTrans.Tensor)
......
@@ -387,7 +387,7 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
     if (y->devID < 0) {
         CheckNTErrors((tLen <= y->unitNum), "Illegal input length!");
         CheckNTErrors((XTensor::IsIdentical(t, y) && XTensor::IsIdentical(dedy, y)),
                       "The input tensors must be of the same size!");
         CheckNTErrors(((dedy->devID == t->devID) && (dedy->devID == y->devID)), "Tensor must be on the same device!");
         CheckNTErrors((t->order > leadDim), "Illegal leading dimension!");
         CheckNTErrors((t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
......
@@ -405,7 +405,7 @@ void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
     loss = sum_{i} (-t_i * log(y_i)), where t and y are distributions
     dloss/dy_i = -t_i / y_i
     */
-    if(LFName == CROSSENTROPY){
+    else if(LFName == CROSSENTROPY){
         if(t->isSparse){
             ShowNTErrors("TODO!");
         }
@@ -416,6 +416,9 @@ void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
             KernelLossBackwardCrossEntropyBlock<<<blocks, threads>>>(dedyp, tp, yp, blockSize, tBeg * stride, tLen * stride, y->unitNum);
         }
     }
+    else{
+        ShowNTErrors("TODO");
+    }
 
     BacktoCudaDev(y->devID, devIDBackup);
 }
......
@@ -773,7 +773,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME
         /* for y = softmax(s), we get dE/ds
            where E is the error function (defined by loss) */
-        LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
+        _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
 
         /* for s = x * w, we get
            dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
@@ -818,7 +818,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME
         /* backpropagation through the activation function:
            dE/ds = dE/dh * dh/ds */
-        HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
+        _HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
 
         /* gradient of the weight: dE/dw = x^T * dE/ds */
         _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
......
@@ -155,7 +155,7 @@ bool TestHardTanH2()
     _HardTanH(x, y);
 
     /* call HardTanHBackward function */
-    HardTanHBackward(gold, y, x, dedy, dedx, SQUAREDERROR);
+    _HardTanHBackward(gold, y, x, dedy, dedx, SQUAREDERROR);
 
     /* check results */
     cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
@@ -184,7 +184,7 @@ bool TestHardTanH2()
     _HardTanH(xGPU, yGPU);
 
     /* call HardTanHBackward function */
-    HardTanHBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, SQUAREDERROR);
+    _HardTanHBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, SQUAREDERROR);
 
     /* check results */
     gpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
......
@@ -142,7 +142,7 @@ bool TestLogSoftmax2()
     _LogSoftmax(x, y, 1);
 
     /* call LogSoftmaxBackward function */
-    LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
 
     /* check result */
     cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
@@ -170,7 +170,7 @@ bool TestLogSoftmax2()
     _LogSoftmax(xGPU, yGPU, 1);
 
     /* call LogSoftmaxBackward function */
-    LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
 
     /* check result */
     gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
@@ -246,7 +246,7 @@ bool TestLogSoftmax3()
     _LogSoftmax(x, y, 1);
 
     /* call LogSoftmaxBackward function */
-    LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
 
     /* check result */
     cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
@@ -274,7 +274,7 @@ bool TestLogSoftmax3()
     _LogSoftmax(xGPU, yGPU, 1);
 
     /* call LogSoftmaxBackward function */
-    LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
+    _LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
 
     /* check result */
     gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
......
@@ -86,7 +86,7 @@ bool TestMultiply1()
     /* call Multiply function */
     _Multiply(s1, s2, t, 0, 0);
     _MultiplyMe(tMe, s2, 0, 0);
-    tUser = Multiply(*s1, *s2, 0, 0);
+    tUser = Multiply(*s1, *s2, 0);
 
     /* check results */
     cpuTest = t->CheckData(answer, tUnitNum)
@@ -112,7 +112,7 @@ bool TestMultiply1()
     /* call Multiply function */
     _Multiply(sGPU1, sGPU2, tGPU, 0, 0);
     _MultiplyMe(tMeGPU, sGPU2, 0, 0);
-    tUserGPU = Multiply(*sGPU1, *sGPU2, 0, 0);
+    tUserGPU = Multiply(*sGPU1, *sGPU2, 0);
 
     /* check results */
     gpuTest = tGPU->CheckData(answer, tUnitNum)
@@ -209,7 +209,7 @@ bool TestMultiply2()
     /* call Multiply function */
     _Multiply(s1, s2, t, 0, 0);
     _MultiplyMe(tMe, s2, 0, 0);
-    tUser = Multiply(*s1, *s2, 0, 0);
+    tUser = Multiply(*s1, *s2, 0);
 
     /* check results */
     cpuTest = t->CheckData(answer, tUnitNum)
@@ -235,11 +235,12 @@ bool TestMultiply2()
     /* call Multiply function */
     _Multiply(sGPU1, sGPU2, tGPU, 0, 0);
     _MultiplyMe(tMeGPU, sGPU2, 0, 0);
-    tUserGPU = Multiply(*sGPU1, *sGPU2, 0, 0);
+    tUserGPU = Multiply(*sGPU1, *sGPU2, 0);
 
     /* check results */
-    gpuTest = tGPU->CheckData(answer, tUnitNum)
-              && tMeGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    gpuTest = tGPU->CheckData(answer, tUnitNum) &&
+              tMeGPU->CheckData(answer, tUnitNum) &&
+              tUserGPU.CheckData(answer, tUnitNum);
 
     /* destroy variables */
     delete s1;
......