updates

fbd915c6 · xiaotong · d69372b3 · fbd915c6 · fbd915c6 · fbd915c6
Commit fbd915c6 authored Mar 11, 2021 by xiaotong
--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -485,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
 /* sleep for a while */
 void XSleep(int sleepTime)
 {
+    if (sleepTime <= 0)
+        return;
+
 #ifdef  _WIN32
    Sleep((DWORD)sleepTime);
 #else

--- a/source/train/XModel.cpp
+++ b/source/train/XModel.cpp
@@ -39,6 +39,7 @@ XParamKeeper::XParamKeeper()
 {
    param = NULL;
    flag = PARAM_STATE_NOT_READY;
+    trainFlag = PARAM_STATE_NOT_READY;
    MUTEX_INIT(accessLock);
    MUTEX_INIT(trainLock);
 }
@@ -153,9 +154,36 @@ bool XModel::CheckParam()
 /* initial model for running the it */
 void XModel::InitForRun()
 {
+    RefreshMe(); /* resets isGradFinished, flag and trainFlag of every parameter */
+}
+
+/* mark every parameter as NOT_READY for training and lock its trainLock
+   (the locks are released when a run of training is finished) */
+void XModel::LockParamsForTraining()
+{
    for (int i = 0; i < paramNum; i++) {
-        params[i].param->isGradFinished = false;
-        params[i].flag = PARAM_STATE_NOT_READY;
+        params[i].trainFlag = PARAM_STATE_NOT_READY;
+        MUTEX_LOCK(params[i].trainLock);
+
+        /* where is UNLOCK? trainLock is deliberately left locked here. It is released by
+           XWorkerBroadcast::FinishUpdateSingle() when the training (a step) is finished.
+           Then, WaitForUnlockedParams() can continue (a START-WAIT process in each step).
+           NOTE(review): the unlock presumably runs on another thread - confirm MUTEX_UNLOCK allows this */
+    }
+}
+
+/* wait until the trainLock of every parameter is unlocked, then check that its trainFlag is PARAM_STATE_UPDATED */
+void XModel::WaitForUnlockedParams()
+{
+    for (int i = 0; i < paramNum; i++) {
+        /* this lock is acquired only after the trainLock has been unlocked
+           in training. In this way, we are actually waiting for
+           the FINISHED signal from other workers/threads. */
+        MUTEX_LOCK(params[i].trainLock);
+
+        CheckNTErrors(params[i].trainFlag == PARAM_STATE_UPDATED,
+                      "the state of the parameter is wrong!");
+        MUTEX_UNLOCK(params[i].trainLock);
    }
 }

@@ -165,6 +193,7 @@ void XModel::RefreshMe()
    for (int i = 0; i < paramNum; i++) {
        params[i].param->isGradFinished = false;
        params[i].flag = PARAM_STATE_NOT_READY;
+        params[i].trainFlag = PARAM_STATE_NOT_READY;
    }
 }


--- a/source/train/XModel.h
+++ b/source/train/XModel.h
@@ -60,6 +60,11 @@ public:
    /* the parameter state */
    PARAM_STATE flag;

+    /* the state of the entire training process
+      (chosen from PARAM_STATE_NOT_READY and
+      PARAM_STATE_UPDATED) */
+    PARAM_STATE trainFlag;
+
    /* a mutex for locking and unlocking the parameter */
    MUTEX_HANDLE accessLock;

@@ -120,6 +125,13 @@ public:
    /* check if the parameters are well-defined for training */
    bool CheckParam();

+    /* lock the parameter states (wait for unlocking them when
+       a run of training is finished) */
+    void LockParamsForTraining();
+
+    /* wait until the parameter states are unlocked */
+    void WaitForUnlockedParams();
+    
    /* initial model for running the it */
    void InitForRun();


--- a/source/train/XWorkerBroadcast.cpp
+++ b/source/train/XWorkerBroadcast.cpp
@@ -223,4 +223,49 @@ bool XWorkerBroadcast::AddJobBroadcast(XModel * source, XList * targetList)
    return true;
 }

+
+/*
+mark the training state of the parameter as UPDATED (i.e., finished) and unlock its trainLock
+>> source - the model that we are updating
+>> pid - the parameter index
+*/
+void XWorkerBroadcast::FinishUpdateSingle(XModel * source, int pid)
+{
+    source->params[pid].trainFlag = PARAM_STATE_UPDATED;
+    MUTEX_UNLOCK(source->params[pid].trainLock);
+}
+
+/* wrapper of FinishUpdateSingle (args: item 0 = the broadcaster, item 1 = the source model, item 2 = the parameter index) */
+void XWorkerBroadcast::FinishSingle(XList * args)
+{
+    XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(0);
+    XModel * source = (XModel*)args->GetItem(1);
+    int pid = args->GetInt(2);
+
+    broadcaster->FinishUpdateSingle(source, pid);
+}
+
+/*
+add a new job of finishing the update (run at once if isInstantRun is set, otherwise enqueued); always returns true
+>> source - the model that we are updating
+>> pid - the parameter index
+*/
+bool XWorkerBroadcast::AddJobFinish(XModel * source, int pid)
+{
+    CheckNTErrors(source != NULL, "no input source tensor!");
+    CheckNTErrors(pid >= 0 && pid < source->paramNum, "illegal parameter index!");
+
+    XList args; /* the item order must match what FinishSingle reads */
+    args.Add(this);
+    args.Add(source);
+    args.AddInt(pid);
+
+    if (isInstantRun)
+        XWorkerBroadcast::FinishSingle(&args);
+    else
+        queue.EnqueueJob((void*)(char*)XWorkerBroadcast::FinishSingle, &args);
+
+    return true;
+}
+
 }
--- a/source/train/XWorkerBroadcast.h
+++ b/source/train/XWorkerBroadcast.h
@@ -82,6 +82,16 @@ public:

    /* add a new job of broadcasting data (for a model) */
    bool AddJobBroadcast(XModel * source, XList * targetList);
+
+    /* mark the training state of the parameter as UPDATED (finished) */
+    void FinishUpdateSingle(XModel * source, int pid);
+
+    /* wrapper of FinishUpdateSingle */
+    static
+    void FinishSingle(XList * args);
+
+    /* add a new job of finishing the update */
+    bool AddJobFinish(XModel * source, int pid);
 };

 }