Commit 726f5e21 by xiaotong

updates

parent 70b8f94b
......@@ -118,6 +118,12 @@ void XLeader::SetServerModel(XConfig * config, XModel * model)
SetServerModel(config, model, &members);
}
/* fetch the model that the leader keeps on the server side
<< return - pointer to the server-side model (owned by this leader) */
XModel * XLeader::GetServerModel()
{
return &(this->serverModel);
}
/* initialize the models for running them */
void XLeader::InitForRun()
{
......@@ -356,12 +362,10 @@ void XLeader::AddJobParamterWorker(int n)
run the model (for one time). Basically this is a map-reduce process.
>> config - the configuration
>> dataDistributor - data distributor
>> model - the neural network that we want to run
>> optimizer - the optimization method
<< return - if we can fetch the new data
*/
bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer)
bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor, XOptimizer * optimizer)
{
CheckNTErrors(jworkers.count > 0, "No jworkers!");
CheckNTErrors(cworkers.count > 0, "No cworkers!");
......@@ -376,6 +380,36 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
InitForRun();
/* run models on job workers */
activeJobCount = RunModel(config, dataDistributor, active);
/* update the model on the server side */
if (activeJobCount > 0 && isToUpdate)
RunUpdate(config, optimizer, active);
WaitForFinishing(active, isToUpdate);
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[i];
worker->Clear();
}
delete[] active;
return activeJobCount > 0;
}
/*
run the model
>> config - the configuration
>> dataDistributor - to load batches of samples
>> active - flag for each job worker (1 = active, 0 = not active)
<< return - number of active job workers
*/
int XLeader::RunModel(XConfig * config, DataDistributeBase * dataDistributor, int * active)
{
int activeJobCount = 0;
for (int i = 0; i < jworkers.count; i++)
active[i] = 0;
......@@ -390,9 +424,7 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
/* get a batch of samples */
bool fetched = dataDistributor->GetBatchSimple(worker->GetInput(), worker->GetGold());
if (!fetched)
isDataOK = false;
else {
if (fetched){
/* job in queue 1: refresh the model */
worker->AddJobRefresh(jmodel);
......@@ -412,7 +444,17 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
}
}
if (activeJobCount > 0 && isToUpdate) {
return activeJobCount;
}
/*
update the model
>> config - the configuration
>> optimizer - the optimizer
>> active - flag for each job worker (1 = active, 0 = not active)
*/
void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * active)
{
/* workers */
XWorkerCollect * collecter = (XWorkerCollect*)cworkers.GetItem(0);
XWorkerUpdate * updater = (XWorkerUpdate*)uworkers.GetItem(0);
......@@ -439,6 +481,10 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
jobQueues.Add(worker->GetJobQueue());
}
if(1){
}
/* jobs in queue 2: collect the (gradient) data and other stuff. This
is a reduce process. The collector will add a job in queue 3
to update the model. The updater will add a job in queue 4 to
......@@ -449,18 +495,6 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
&members, &membersAll, &serverModel,
optimizer, updater, broadcaster);
collecter->AddJobEnqueueFinished();
}
WaitForFinishing(active, isToUpdate);
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[i];
worker->Clear();
}
delete[] active;
return isDataOK;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -116,6 +116,9 @@ public:
/* set the server model */
void SetServerModel(XConfig * config, XModel * model);
/* get server model */
XModel * GetServerModel();
/* initialize the models for running them */
void InitForRun();
......@@ -158,9 +161,14 @@ public:
/* add a parameter worker (or a pipeline) */
void AddJobParamterWorker(int n);
/* run the model (for one time) */
bool Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer);
/* run the model and update it (for one time) */
bool Run(XConfig * config, DataDistributeBase * dataDistributor, XOptimizer * optimizer);
/* run the model */
int RunModel(XConfig * config, DataDistributeBase * dataDistributor, int * active);
/* update the model */
void RunUpdate(XConfig * config, XOptimizer * optimizer, const int * active);
};
}
......
......@@ -144,7 +144,7 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
optimizer->SetLearningRate(LRScheduler.MakeLRTransformer(lrate, step + 1, nwarmup));
/* one step of update */
ok = leader.Run(config, dataDistributor, model, optimizer);
ok = leader.Run(config, dataDistributor, optimizer);
float loss = leader.GetLoss() / leader.GetSampleNum();
......@@ -159,7 +159,7 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
}
else {
/* one step with no update */
ok = leader.Run(config, dataDistributor, model, NULL);
ok = leader.Run(config, dataDistributor, NULL);
}
}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论