bug fixes and updates of XWorkerBroadcast

8a6d5d3b · xiaotong · 1afdcdba · 8a6d5d3b · 8a6d5d3b · 8a6d5d3b
Commit 8a6d5d3b authored Mar 26, 2021 by xiaotong
--- a/source/train/TTrain.cpp
+++ b/source/train/TTrain.cpp
@@ -80,10 +80,10 @@ void TestTrain()
    config.Add("lrate", 0.1F);
    config.Add("nstep", 100000);
    config.Add("nepoch", 5);
-    config.Add("jobdev0", -1);
+    config.Add("jobdev0", 0);
-    config.Add("jobdev1", -1);
+    //config.Add("jobdev1", -1);
-    config.Add("jobdev2", -1);
+    //config.Add("jobdev2", -1);
-    config.Add("jobdev3", -1);
+    //config.Add("jobdev3", -1);
    //config.Add("jobdev4", -1);
    int serverDevID = config.GetInt("jobdev0", -1);

--- a/source/train/XLeader.cpp
+++ b/source/train/XLeader.cpp
@@ -415,7 +415,7 @@ void XLeader::MakeParamMap()
        }
        for (int j = 0, c = 0; j < jworkers.count; j++) {
-            XWorker * worker = (XWorker*)jworkers[i];
+            XWorker * worker = (XWorker*)jworkers[j];
            if (worker->GetWorkerType() == XWORKER_TYPE_JOB) {
                XModel * model = ((XWorkerJob*)jworkers[j])->GetModel();
                paramMap[i][c].tensor = model->params[i].tensor;
@@ -522,7 +522,7 @@ int XLeader::RunModel(XConfig * config, DataDistributeBase * dataDistributor, in
 }
 /* 
-update the model 
+update the model in a standard server-worker manner
 >> config - the configuration
 >> optimizer - the optimizer
 >> active - flag for each job worker (1 = active, 0 = not active)
@@ -555,7 +555,7 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
    CheckNTErrors(jobQueues.count == serverModel.paramNum, "Incompatiable model!");
-    /* jobs in queue 2 (say jobQueue): collect the (gradient) data and other stuff.
+    /* jobs in queue 2 (say jobQueue): collect the (gradient) data.
    This is a reduce process. Then we add a job to to update the model. followed
    by a job to broadcast the lastest parameters to workers. NOTE that we
    would update a worker to the latest model parameters, even if it is not
@@ -583,6 +583,8 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
        }
    }
+    XList * paramList = new XList[serverModel.paramNum];
    CheckNTErrors(modelCount == modelNum, "Wrong model number!");
    /* This is a simple implementation of the do-and-wait process */
@@ -620,6 +622,11 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
                        collecter->AddJobCollectDataP2P(jobQueue, paramWorker.grad, paramServer.grad);
                        collecter->AddJobEnqueueFinished(jobQueue);
+                        /* We keep the worker parameter in a list. It would be used when we broadcast
+                           the updated parameter to the workers, that is, this is a list of worker 
+                           parameters. */
+                        paramList[j].Add(&paramWorker);
                        /* reset the flag */
                        paramWorker.flag = PARAM_STATE_COLLECTED;
                        finished++;
@@ -637,7 +644,7 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
                                updater->AddJobEnqueueFinished(jobQueue);
                                /* broadcast the new parameter to other models */
-                                broadcaster->AddJobBroadcast(jobQueue, &serverModel, &membersAll, j);
+                                broadcaster->AddJobBroadcast(jobQueue, &paramServer, &paramList[j]);
                                broadcaster->AddJobEnqueueFinished(jobQueue);
                            }
                        }
@@ -658,6 +665,7 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
    delete[] finishedCount;
    delete[] modelFlag;
+    delete[] paramList;
 }
 } /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/train/XTrainer.cpp
+++ b/source/train/XTrainer.cpp
@@ -89,7 +89,7 @@ run the trainer (this is the core process)
 >> optimizer - the optimizer
 */
 void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
-                   XModel * model, XOptimizer * optimizer)
+    XModel * model, XOptimizer * optimizer)
 {
    CheckNTErrors(config != NULL, "No input config!");
    CheckNTErrors(dataDistributor != NULL, "No input data distributor!");
@@ -150,7 +150,7 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
                if ((step + 1) % 100 == 0)
                    XPRINT5(1, stderr, "[INFO] elapsed=%.1fs epoch:%d step:%d sample:%d loss:%f\n",
-                            GetClockSec() - startT, epoch + 1, step + 1, leader.GetSampleNum(), loss);
+                        GetClockSec() - startT, epoch + 1, step + 1, leader.GetSampleNum(), loss);
                leader.ResetParamGrad();
@@ -166,10 +166,12 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
        dataDistributor->End();
        if (step >= optimizer->nstep)
-            break;   
+            break;
    }
    delete[] ids;
+    XPRINT(1, stderr, "[INFO] Training Finished[DONE]");
 }
 /* show settings of training */

--- a/source/train/XWorkerBroadcast.cpp
+++ b/source/train/XWorkerBroadcast.cpp
@@ -53,22 +53,21 @@ void XWorkerBroadcast::SetBroadcastMode(DATA_BROADCAST_TYPE myMode)
 /* 
 broadcast data for a parameter 
 >> source - the data (as a model) that we want to broadcast
->> targetList - the target places that we recieve the data
+>> targetList - the target places where we receive the data
->> pid - the parameter index
 */
-void XWorkerBroadcast::BroadcastData(XModel * source, XList * targetList, int pid)
+void XWorkerBroadcast::BroadcastData(XTensorKeeper * source, XList * targetList)
 {
-    CheckNTErrors(source->params[pid].flag == PARAM_STATE_UPDATED,
+    CheckNTErrors(source->flag == PARAM_STATE_UPDATED,
                  "The parameter is not ready for broadcasting");
    for (int i = 0; i < targetList->count; i++) {
-        XModel * target = (XModel*)targetList->GetItem(i);
+        XTensorKeeper * target = (XTensorKeeper*)targetList->GetItem(i);
        /* data transmit */
-        BroadcastP2P(source->params[pid].tensor, target->params[pid].tensor);
+        BroadcastP2P(source->tensor, target->tensor);
        /* update the flag */
-        target->params[pid].flag = PARAM_STATE_UPDATED;
+        target->flag = PARAM_STATE_UPDATED;
    }
 }
@@ -81,20 +80,17 @@ void XWorkerBroadcast::Broadcast(XList * args)
    int paramCount = 0;
    XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(paramCount++);
-    XModel * source = (XModel*)args->GetItem(paramCount++);
+    XTensorKeeper * source = (XTensorKeeper*)args->GetItem(paramCount++);
    /* target models */
    int targetNum = args->GetItemInt(paramCount++);
    XList target;
    for (int i = 0; i < targetNum; i++) {
-        XModel * model = (XModel*)args->GetItem(paramCount++);
+        XTensorKeeper * model = (XTensorKeeper*)args->GetItem(paramCount++);
        target.Add(model);
    }
-    /* parameter index */
+    broadcaster->BroadcastData(source, &target);
-    int p = args->GetInt(paramCount++);
-    broadcaster->BroadcastData(source, &target, p);
 }
 /* 
@@ -116,21 +112,18 @@ void XWorkerBroadcast::BroadcastP2P(XTensor * source, XTensor * target)
 add a new job of broadcasting data (for a parameter)
 >> jobQueue - the queue where we push jobs
 >> source - the data that we want to broadcast
->> targetList - the target places that we recieve the data
+>> targetList - the target places where we receive the data
->> pid - the parameter index
 */
-bool XWorkerBroadcast::AddJobBroadcast(XQueue * jobQueue, XModel * source, XList * targetList, int pid)
+bool XWorkerBroadcast::AddJobBroadcast(XQueue * jobQueue, XTensorKeeper * source, XList * targetList)
 {
    CheckNTErrors(source != NULL, "no input source tensor!");
    CheckNTErrors(targetList != NULL, "no input target tensor list!");
-    CheckNTErrors(pid >= 0 && pid < source->paramNum, "illegal parameter index!");
    XList args;
    args.Add(this);
    args.Add(source);
    args.AddInt(targetList->count);
    args.AddList(targetList);
-    args.AddInt(pid);
    XQueue& queueRun = jobQueue != NULL ? *jobQueue : queue;

--- a/source/train/XWorkerBroadcast.h
+++ b/source/train/XWorkerBroadcast.h
@@ -61,7 +61,7 @@ public:
    void SetBroadcastMode(DATA_BROADCAST_TYPE myMode);
    /* broadcast data for a parameter */
-    void BroadcastData(XModel * source, XList * targetList, int pid);
+    void BroadcastData(XTensorKeeper * source, XList * targetList);
    /* wrapper of BroadcastDataSingle */
    static
@@ -71,7 +71,7 @@ public:
    void BroadcastP2P(XTensor * source, XTensor * target);
    /* add a new job of broadcasting data (for a parameter) */
-    bool AddJobBroadcast(XQueue * jobQueue, XModel * source, XList * targetList, int pid);
+    bool AddJobBroadcast(XQueue * jobQueue, XTensorKeeper * source, XList * targetList);
 };
 }