Commit d5269725 by xiaotong

bug fixes

parent 9eda6d83
......@@ -74,6 +74,7 @@ void TestTrain()
XConfig config;
config.Add("dev", -1);
config.Add("lrate", 0.1F);
TTDataLoader loader;
loader.SetFileName("ttrain.txt");
......@@ -217,9 +218,14 @@ void TTModel::SetConfig(XConfig &myConfig)
config.CreateFromMe(myConfig);
}
/* initialize the model */
/*
initialize the model
>> myConfig - configuration
>> devID - device id
*/
void TTModel::Init(XConfig &myConfig, int devID)
{
Clear();
SetConfig(myConfig);
vSize = MAX_INT_IN_TTRAIN + 1;
......@@ -230,12 +236,25 @@ void TTModel::Init(XConfig &myConfig, int devID)
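/* the hidden layer presumably takes the concatenation of three word
   embeddings as its input, hence the 3 * eSize input dimension (an
   assumption; the embedding lookup itself is not shown in this hunk) */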
InitTensor2D(&hiddenW, 3 * eSize, hSize, X_FLOAT, devID);
InitTensor2D(&outputW, hSize, vSize, X_FLOAT, devID);
embeddingW.SetName("embeddingw");
hiddenW.SetName("hiddenw");
outputW.SetName("outputw");
embeddingW.SetDataRand(-0.1F, 0.1F);
hiddenW.SetDataRand(-0.1F, 0.1F);
outputW.SetDataRand(-0.1F, 0.1F);
AddParam(&embeddingW);
AddParam(&hiddenW);
AddParam(&outputW);
}
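A minimal usage sketch (hypothetical driver code, not part of this commit) of how the model above might be configured and initialized, following the XConfig calls shown in TestTrain; the default-constructed TTModel and the CPU device id -1 are assumptions:
XConfig config;
config.Add("dev", -1);        /* run on the CPU */
config.Add("lrate", 0.1F);    /* learning rate */
TTModel model;
model.Init(config, -1);       /* allocate and randomly initialize embeddingW, hiddenW and outputW */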
/* create the model */
/*
create the model
>> devID - device id
>> input - the input tensor
>> output - the output tensor
*/
void TTModel::Forward(int devID, XTensor * input, XTensor * output)
{
XTensor embedding;
......@@ -261,7 +280,10 @@ void TTModel::Clear()
config.Clear();
}
/* clone the model */
/*
clone the model
>> devID - device id
*/
XModel * TTModel::Clone(int devID)
{
TTModel * model = new TTModel();
......@@ -293,8 +315,10 @@ bool TTModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList* l
XNet net;
/* create the neural network and run it */
Forward(devID, input, output);
/* gold standard in one-hot representation */
goldOneHot = IndexToOnehot(*gold, vSize, 0.0F);
int* dims = new int[goldOneHot.order];
......@@ -303,8 +327,10 @@ bool TTModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList* l
dims[goldOneHot.order - 2] = goldOneHot.GetDim(goldOneHot.order - 1);
goldOneHot.Reshape(goldOneHot.order - 1, dims);
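/* the one-hot gold tensor is collapsed by one dimension here, keeping the
   vocabulary size as its last dimension; the remaining entries of dims are
   filled in the lines elided above */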
/* loss */
*loss = CrossEntropy(*output, goldOneHot);
/* back-propagation */
net.Backward(*loss);
delete[] dims;
......
......@@ -90,8 +90,11 @@ Set the server model. It distributes the server-side parameters on different dev
*/
void XLeader::SetServerModel(XConfig * config, XModel * model, XList * memberModels)
{
serverModel.params.Clear();
serverModel.params.AddList(&model->params);
serverModel.Clear();
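/* register each worker-side parameter with the server model; AddParam also
   allocates the per-parameter state flag used during gradient collection */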
for (int i = 0; i < model->params.count; i++) {
XTensor * param = (XTensor*)model->params[i];
serverModel.AddParam(param);
}
/* TODO: we can place parameters on different devices */
}
......
......@@ -97,6 +97,24 @@ bool XModel::RunMe(XList * args)
return false;
}
/*
add a parameter tensor
>> param - the parameter tensor to be added
*/
void XModel::AddParam(XTensor* param)
{
//param->SetVarFlag();
params.Add(param);
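/* grow the per-parameter flag array by one: copy the old flags and mark
   the newly added parameter as not ready */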
PARAM_STATE * newFlags = new PARAM_STATE[params.count];
memcpy(newFlags, flags, sizeof(PARAM_STATE) * (params.count - 1));
newFlags[params.count - 1] = PARAM_STATE_NOT_READY;
delete[] flags;
flags = newFlags;
}
/* refresh the model */
void XModel::RefreshMe()
{
......
......@@ -89,6 +89,8 @@ protected:
bool RunMe(XList * args);
public:
/* add a parameter tensor */
void AddParam(XTensor* param);
/* refresh the model */
void RefreshMe();
......@@ -100,6 +102,7 @@ public:
/* wrapper of Run() */
static
bool Run(XList * args);
};
}
......
......@@ -47,6 +47,9 @@ initialize the optimizer
*/
void XOptimizer::Init(XConfig &config)
{
nstep = config.GetInt("nstep", 100000);
nepoch = config.GetInt("nepoch", 50);
lrate = config.GetFloat("lrate", 0.1F);
}
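A minimal sketch (hypothetical values, not part of this commit) of how these hyper-parameters could be supplied through XConfig before calling Init, assuming XOptimizer can be constructed directly; any missing key falls back to the defaults above:
XConfig config;
config.Add("lrate", 0.1F);      /* learning rate */
config.Add("nepoch", 50);       /* number of training epochs */
config.Add("nstep", 100000);    /* maximum number of update steps */
XOptimizer optimizer;
optimizer.Init(config);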
/* clear the optimizer */
......
......@@ -76,7 +76,7 @@ void XWorkerCollect::CollectData(XList * sourceList, XModel * target, long sleep
if (collectMode == DATA_COLLECT_P2P) {
for (int j = 0; j < tp.count; j++) {
/* tp[j]->isGradFinished is true only if the model finishes the computation
(in another process) */
if (target->flags[j] == PARAM_STATE_COLLECTED || !tp[j]->isGradFinished)
continue;
......@@ -91,7 +91,7 @@ void XWorkerCollect::CollectData(XList * sourceList, XModel * target, long sleep
if (source->flags[j] != PARAM_STATE_COLLECTED && sp[j]->isGradFinished) {
/* data transmit */
CollectP2P(sp.GetItem(j)->grad, tp.GetItem(j)->grad);
CollectP2P(sp[j]->grad, tp[j]->grad);
/* reset the flag */
source->flags[j] = PARAM_STATE_COLLECTED;
......@@ -158,16 +158,16 @@ void XWorkerCollect::CollectData(XList * sourceList, XModel * target, long sleep
if (finished == tp.count * sourceList->count)
break;
/* reset the flags */
for (int j = 0; j < tp.count; j++)
target->flags[j] = PARAM_STATE_COLLECTED;
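/* sleepTime is presumably given in milliseconds: Windows Sleep() takes
   milliseconds, while POSIX sleep() takes seconds, hence the division by 1000 */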
#ifdef _WIN32
Sleep((DWORD)sleepTime);
#else
sleep((unsigned)sleepTime / 1000);
#endif
}
/* reset the flags */
for (int j = 0; j < tp.count; j++)
target->flags[j] = PARAM_STATE_COLLECTED;
}
/* wrapper of CollectData */
......@@ -203,7 +203,8 @@ void XWorkerCollect::CollectP2P(XTensor * source, XTensor * target)
CheckNTErrors(IsSameShaped(source, target), "The two tensors should be of the same shape!");
/* target += source */
Sum(*source, *target, *source);
if(source != target)
Sum(*source, *target, *source);
}
/*
......
......@@ -72,6 +72,8 @@ void XWorkerUpdate::UpdateModel(XModel * model, XOptimizer * optimizer, long sle
XTensor * param = params.GetItem(i);
XTensor * grad = param->grad;
CheckNTErrors(grad != NULL, "No gradient!");
/* update the parameter */
optimizer->UpdateParam(param, grad, i);
......