Commit 1348bcba by xuchen

merge with xuchen branch

parent d221ef9d
@@ -77,104 +77,20 @@ backward propagation to obtain gradient
 >> root - root node (output) of the network
 >> loss - name of loss function
 */
-void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XTensor &root)
 {
     TensorList roots(1);
     roots.Add(&root);
-    TensorList golds(1);
-    golds.Add(NULL);
-    TensorList paddings(1);
-    paddings.Add(NULL);
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient wrt. the loss/error function
->> root - root node (output) of the network
->> gold - gold standard for the output
->> loss - name of loss function
-*/
-void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
-{
-    TensorList roots(1);
-    roots.Add(&root);
-    TensorList golds(1);
-    golds.Add(&gold);
-    TensorList paddings(1);
-    paddings.Add(NULL);
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient wrt. the loss/error function
->> root - root node (output) of the network
->> gold - gold standard for the output
->> padding - specify a target value that is ignored and does not contribute to the gradient computation
->> loss - name of loss function
-*/
-void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
-{
-    TensorList roots(1);
-    roots.Add(&root);
-    TensorList golds(1);
-    golds.Add(&gold);
-    TensorList paddings(1);
-    paddings.Add(&padding);
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient
-with a number of root nodes
->> roots - a list of root nodes (output) of the network
->> loss - name of loss function
-*/
-void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
-{
-    TensorList golds(roots.count);
-    TensorList paddings(roots.count);
-    for (int i = 0; i < roots.count; i++) {
-        golds.Add(NULL);
-        paddings.Add(NULL);
-    }
-
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient
-with a number of root nodes
->> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> loss - name of loss function
-*/
-void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
-{
-    TensorList paddings(roots.count);
-    for (int i = 0; i < roots.count; i++)
-        paddings.Add(NULL);
-
-    Backward(roots, golds, paddings, loss);
-}
+    Backward(roots);
 }

 /*
 backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes
 >> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> paddings - specify a target value that is ignored
->> loss - name of loss function
 */
-void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots)
 {
     Traverse(roots);

@@ -187,39 +103,6 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings,
         node->visitMark = NODE_UNFINISHED;
     }

-    //XLossGrad lossGrad;
-
-    /* we start with the gradient with respect to the loss for output layers */
-    /*for(int i = 0; i < roots.count; i++){
-        XTensor * root = (XTensor*)roots.Get(i);
-        XTensor * gold = (XTensor*)golds.Get(i);
-        XTensor * padding = (XTensor*)paddings.Get(i);
-        XLink &income = root->income;
-        int funcID = income.typeID;
-        void * params = income.params;*/
-
-        /* we compute dE/dx if the output is generated by an activation function y = f(x).
-           Note that we do not need to obtain dE/dy here because it is of no use in the
-           following process of back-propagation */
-        /*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
-                XTensor * x = income.tails[0];
-                XNoder::MakeGrad(x);
-                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
-                root->visitMark = NODE_FINISHED;
-            }
-            else {
-                XNoder::MakeGrad(root);
-                lossGrad.Compute(gold, root, root->grad, padding, loss);
-            }
-        }*/
-
-        /* we compute dE/dy (y is the output) if no predefined activation function is used */
-        /*else{
-            XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, NULL, loss);
-        }
-    }*/
-
     /* back-propagation from output to input */
     for(int i = nodes.count - 1; i >= 0; i--){
         XTensor * node = (XTensor*)nodes.Get(i);

@@ -460,7 +343,6 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
     }
 }

-
 /*
 search for a node in a top-down manner by its name
 >> top - the top most node
...
@@ -61,25 +61,11 @@ struct XNet
     void Clear();

     /* backward propagation to obtain gradient */
-    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient wrt. the loss/error function */
-    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient wrt. the loss/error function */
-    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient
-       with a number of root nodes */
-    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient
-       with a number of root nodes */
-    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XTensor &root);

     /* backward propagation to obtain gradient wrt. the loss/error function
        with a number of root nodes */
-    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots);

     /* backward computation for a given node */
     void BackwardNode(XTensor * node, bool isEfficent = false);
...
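Note on the API change: after this commit the loss is expected to be a regular node of the computation graph rather than a LOSS_FUNCTION_NAME named at backward time, so only the root (the loss node) is passed. A minimal usage sketch under that assumption; the CrossEntropy call below stands in for whatever graph-level loss op the caller actually uses:

    XNet net;
    XTensor output;                                /* network output, built from grad-enabled ops */
    XTensor gold;                                  /* gold-standard answer                        */

    XTensor lossNode = CrossEntropy(output, gold); /* loss as a graph node (illustrative)         */
    net.Backward(lossNode);                        /* gradients flow back from the loss node      */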
@@ -76,16 +76,11 @@ void T2TAttention::InitModel(int argc, char ** argv,
     InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);

     float scale = 1.0F;
-    float finfoutk = (float)sqrt(6.0F * scale / (d + dk));
-    float finfoutv = (float)sqrt(6.0F * scale / (d + dv));
-    float finfouta = (float)sqrt(6.0F * scale / (d + d));
-    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3 * d));
-
-    wk.SetDataRand(-finfoutk, finfoutk);
-    wq.SetDataRand(-finfoutk, finfoutk);
-    wv.SetDataRand(-finfoutv, finfoutv);
-    wa.SetDataRand(-finfouta, finfouta);
-    wbig.SetDataRand(-finfoutbig, finfoutbig);
+    _SetDataFanInOut(&wk, scale);
+    _SetDataFanInOut(&wq, scale);
+    _SetDataFanInOut(&wv, scale);
+    _SetDataFanInOut(&wa, scale);
+    _SetDataFanInOut(&wbig, scale);
 }

 /*
...
@@ -67,12 +67,10 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
     InitTensor1DV2(&b2, outSize, X_FLOAT, devID);

     float scale = 1.0F;
-    float finfout1 = (float)sqrt(6.0F * scale / (inSize + hSize));
-    float finfout2 = (float)sqrt(6.0F * scale / (hSize + outSize));
-
-    w1.SetDataRand(-finfout1, finfout1);
+    _SetDataFanInOut(&w1, scale);
+    _SetDataFanInOut(&w2, scale);
     b1.SetZeroAll();
-    w2.SetDataRand(-finfout2, finfout2);
     b2.SetZeroAll();
 }
...
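The deleted lines in both hunks were hand-computing the Xavier/Glorot uniform bound, sqrt(6 * scale / (fanIn + fanOut)), separately for each weight matrix; _SetDataFanInOut presumably folds exactly this into one call. A minimal sketch of the assumed equivalent for a 2-D weight (the helper name is illustrative, not from this commit):

    #include <math.h>

    /* Xavier/Glorot uniform initialization:
       bound = sqrt(6 * scale / (fanIn + fanOut)), fill uniformly in [-bound, bound] */
    void XavierUniform(XTensor &w, float scale)
    {
        float fanIn  = (float)w.GetDim(0);
        float fanOut = (float)w.GetDim(1);
        float bound  = (float)sqrt(6.0F * scale / (fanIn + fanOut));
        w.SetDataRand(-bound, bound);   /* uniform fill, as the deleted code did */
    }

For wbig (shape d x 3d) this reproduces the deleted bound finfoutbig = sqrt(6 * scale / (d + 3 * d)).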
@@ -280,7 +280,7 @@ void XTensor::Init()
     isTmp = false;
     isGrad = false;
     isVar = false;
-    enableGrad = false;
+    enableGrad = true;
     visitMark = 0;
     grad = NULL;
 }
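Flipping this default to true means gradient bookkeeping is now on for every tensor unless explicitly disabled. A short sketch, assuming the flag is a public member that can simply be cleared on tensors that should stay out of the autograd graph:

    XTensor a;
    InitTensor2DV2(&a, 2, 3, X_FLOAT, -1);   /* enableGrad is now true by default            */
    a.enableGrad = false;                    /* opt this tensor out of gradient tracking     */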
@@ -704,6 +704,12 @@ void XTensor::ReshapeMerged(const int i, const int j)
     Reshape(order - 1, dims);
 }

+/* return a tensor whose data type is the same as that of the given tensor */
+XTensor XTensor::TypeAs(const XTensor &input)
+{
+    return ConvertDataType(*this, input.dataType);
+}
+
 /* get the number of items in the data array */
 int XTensor::GetSize() const
 {

@@ -2977,4 +2983,28 @@ void DelTensorBuf(XTensor * tensor)
     delete tensor;
 }

+/* overloading of the plus-sign: shift + tensor */
+XTensor operator+ (const DTYPE shift, const XTensor &tensor)
+{
+    return ScaleAndShift(tensor, 1, shift);
+}
+
+/* overloading of the minus-sign: shift - tensor */
+XTensor operator- (const DTYPE shift, const XTensor &tensor)
+{
+    return ScaleAndShift(tensor, (DTYPE)-1, shift);
+}
+
+/* overloading of the multiply-sign: scale * tensor */
+XTensor operator* (const DTYPE scale, const XTensor &tensor)
+{
+    return ScaleAndShift(tensor, scale, 0);
+}
+
+/* overloading of the division-sign
+   (note: as written this computes tensor / scale, not scale / tensor) */
+XTensor operator/ (const DTYPE scale, const XTensor &tensor)
+{
+    return ScaleAndShift(tensor, (DTYPE)1 / scale, 0);
+}
+
 } /* end of the nts (NiuTrans.Tensor) namespace */
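TypeAs and the left-scalar operators are thin wrappers over ConvertDataType and ScaleAndShift. A short usage sketch (tensor names illustrative):

    XTensor a, b;
    InitTensor2DV2(&a, 2, 2, X_FLOAT, -1);
    InitTensor2DV2(&b, 2, 2, X_INT, -1);

    XTensor c = a.TypeAs(b);      /* a's data converted to b's data type (X_INT)   */
    XTensor d = 1.0F + a;         /* ScaleAndShift(a, 1, 1):  a + 1                */
    XTensor e = 2.0F * a;         /* ScaleAndShift(a, 2, 0):  2 * a                */
    XTensor f = 1.0F - a;         /* ScaleAndShift(a, -1, 1): 1 - a                */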
@@ -283,6 +283,9 @@ public:
     /* reshape the tensor by merging two consecutive dimensions */
     void ReshapeMerged(const int i, const int j = -1);

+    /* return a tensor whose data type is the same as that of the given tensor */
+    XTensor TypeAs(const XTensor &input);
+
     /* get the number of items in the data array */
     int GetSize() const;

@@ -608,6 +611,18 @@ void DelTensor(XTensor * tensor);
 /* free the data space of a given tensor (on the buffer) */
 void DelTensorBuf(XTensor * tensor);

+/* overloading of the plus-sign */
+XTensor operator+ (const DTYPE shift, const XTensor &tensor);
+
+/* overloading of the minus-sign */
+XTensor operator- (const DTYPE shift, const XTensor &tensor);
+
+/* overloading of the multiply-sign */
+XTensor operator* (const DTYPE scale, const XTensor &tensor);
+
+/* overloading of the division-sign */
+XTensor operator/ (const DTYPE scale, const XTensor &tensor);
+
 } /* end of the nts (NiuTrans.Tensor) namespace */

 #endif
@@ -21,6 +21,7 @@

 #include "OnehotAndIndex.h"
 #include "OnehotAndIndex.cuh"
+#include "SetData.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -31,7 +32,7 @@ convert onehot tensor to index tensor
 >> index - index tensor, whose values are integer indices
 >> size - the last dimension size of the onehot tensor
 */
-void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
+void _OnehotToIndex(const XTensor * onehot, XTensor * index, int size)
 {
     CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
     CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");

@@ -78,7 +79,7 @@ make a new tensor to keep the result and return it
 >> size - the last dimension size of the onehot tensor
 << return - the index tensor
 */
-XTensor OnehotToIndex(XTensor & onehot, int size)
+XTensor OnehotToIndex(const XTensor & onehot, int size)
 {
     CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
     CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")

@@ -99,7 +100,8 @@ convert index tensor to onehot tensor
 >> onehot - onehot tensor, whose values are 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
+void _IndexToOnehot(const XTensor * index, XTensor * onehot,
+                    int size, float labelSmoothingP)
 {
     CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
     CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");

@@ -109,11 +111,14 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
     for (int i = 0; i < index->order; i++)
         CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");

-    onehot->SetZeroAll();
+    //onehot->SetZeroAll();

-#ifdef USE_CUDA
     float confidence = 1 - labelSmoothingP;
     float lowconfidence = labelSmoothingP / size;

+    _SetDataFixedFloat(onehot, lowconfidence);
+
+#ifdef USE_CUDA
     if(onehot->devID >= 0 && index->devID >= 0) {
         _CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
         return;

@@ -129,7 +134,7 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
     for (int i = 0; i < blockNum; i++) {
         int id = indexData[i];
         DTYPE * od = onehotData + i * stride;
-        od[id] = 1;
+        od[id] = confidence;
     }
 }

@@ -143,7 +148,7 @@ make a new tensor to keep the result and return it
 >> labelSmoothingP - label smoothing factor
 << return - the onehot tensor
 */
-XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
+XTensor IndexToOnehot(const XTensor & index, int size, float labelSmoothingP)
 {
     CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
...
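The smoothing arithmetic here: every slot of the onehot tensor is first filled with lowconfidence = labelSmoothingP / size (the new _SetDataFixedFloat call), and then only the gold slot is overwritten with confidence = 1 - labelSmoothingP, which is also why the CUDA kernel's else branch (below) could be dropped. A worked example with assumed values labelSmoothingP = 0.1 and size = 4:

    float p    = 0.1F;
    int   size = 4;
    float low  = p / size;                    /* 0.025, pre-filled everywhere      */
    float conf = 1.0F - p;                    /* 0.9, written at the gold index    */
    float row[4] = { conf, low, low, low };   /* resulting row for gold index 0    */
    /* the row sums to conf + 3 * low = 0.975 = 1 - p / size, slightly below 1,
       because the gold slot's own low share is overwritten by conf               */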
@@ -61,7 +61,7 @@ convert onehot tensor to index tensor (cuda version)
 >> index - index tensor, whose values are integer indices
 >> size - the last dimension size of the onehot tensor
 */
-void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
+void _CudaOnehotToIndex(const XTensor * onehot, XTensor * index, int size)
 {
     int devID = onehot->devID;

@@ -111,13 +111,10 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int
     int id = indexData[i];

-    //od[id] = 2.0;
-    //onehotData[i * stride + id] = 0.1;
     if (offset == id)
         od[offset] = confidence;
-    else{
-        od[offset] = lowconfidence;
-    }
+    //else
+    //    od[offset] = lowconfidence;
 }

 /*
@@ -127,7 +124,8 @@ convert index tensor to onehot tensor (cuda version)
 >> onehot - onehot tensor, whose values are 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
+void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
+                        int size, float confidence, float lowconfidence)
 {
     int devID = onehot->devID;
...
@@ -27,10 +27,11 @@

 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* convert onehot tensor to index tensor (cuda version) */
-void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
+void _CudaOnehotToIndex(const XTensor * onehot, XTensor * index, int size);

 /* convert index tensor to onehot tensor (cuda version) */
-void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
+void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
+                        int size, float confidence, float lowconfidence);

 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,18 +27,18 @@

 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* convert onehot tensor to index tensor */
-void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
+void _OnehotToIndex(const XTensor * onehot, XTensor * index, int size);

 /* convert onehot tensor to index tensor (return an XTensor structure)
    make a new tensor to keep the result and return it */
-XTensor OnehotToIndex(XTensor & onehot, int num);
+XTensor OnehotToIndex(const XTensor & onehot, int num);

 /* convert index tensor to onehot tensor */
-void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
+void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);

 /* convert index tensor to onehot tensor (return an XTensor structure)
    make a new tensor to keep the result and return it */
-XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);
+XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);

 } // namespace nts(NiuTrans.Tensor)
...