Commit c0bcb78b by xiaotong

updates of XLeader

parent 0483b910
...@@ -184,47 +184,6 @@ void XLeader::MakeAll(XConfig * config, XModel * model) ...@@ -184,47 +184,6 @@ void XLeader::MakeAll(XConfig * config, XModel * model)
MakeParamMap(); MakeParamMap();
} }
/*
wait for finished states (i.e., all workers finish their jobs)
>> activeJobWorkers - indicates whether each job worker is active
>> isToUpdate - indicates whether the model is updated
*/
void XLeader::WaitForFinishing(const int* activeJobWorkers, const int isToUpdate)
{
int activeCount = 0;
for (int i = 0; i < jworkers.count; i++) {
if (activeJobWorkers[i] > 0) {
XWorker* worker = (XWorker*)jworkers[i];
worker->DequeueFinishedJob();
activeCount++;
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
if (activeCount > 0 && isToUpdate) {
for (int i = 0; i < cworkers.count; i++) {
XWorker* worker = (XWorker*)cworkers[i];
for(int j = 0; j < serverModel.paramNum * activeCount; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < uworkers.count; i++) {
XWorker* worker = (XWorker*)uworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < bworkers.count; i++) {
XWorker* worker = (XWorker*)bworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
}
/* get loss */ /* get loss */
float XLeader::GetLoss() float XLeader::GetLoss()
{ {
......
...@@ -138,9 +138,6 @@ public: ...@@ -138,9 +138,6 @@ public:
/* prepare for running */ /* prepare for running */
void MakeAll(XConfig * config, XModel * model); void MakeAll(XConfig * config, XModel * model);
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* get loss */ /* get loss */
float GetLoss(); float GetLoss();
......
...@@ -49,6 +49,21 @@ public: ...@@ -49,6 +49,21 @@ public:
/* deconstructor */ /* deconstructor */
~XLeaderAllReduce(); ~XLeaderAllReduce();
/* create workers and other stuff used in training */
void MakeAll(XConfig * config, XModel * model, const int * devIDs, const int jobWorkerNum);
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* run the model and update it (for one time) */
bool Run(XConfig* config, DataDistributeBase* dataDistributor, XOptimizer* optimizer);
/* run the model */
int RunModel(XConfig* config, DataDistributeBase* dataDistributor, int* active);
/* update the model */
void RunUpdate(XConfig* config, XOptimizer* optimizer, const int* active);
}; };
} }
......
...@@ -44,7 +44,7 @@ XLeaderPS::~XLeaderPS() ...@@ -44,7 +44,7 @@ XLeaderPS::~XLeaderPS()
} }
/* /*
create workers create workers and other stuff
>> config - configuration >> config - configuration
>> model - the model that we run >> model - the model that we run
>> devIDs - device ids of the workers (the first id is for server) >> devIDs - device ids of the workers (the first id is for server)
...@@ -63,6 +63,47 @@ void XLeaderPS::MakeAll(XConfig * config, XModel * model, const int * devIDs, co ...@@ -63,6 +63,47 @@ void XLeaderPS::MakeAll(XConfig * config, XModel * model, const int * devIDs, co
} }
/* /*
wait for finished states (i.e., all workers finish their jobs)
>> activeJobWorkers - indicates whether each job worker is active
>> isToUpdate - indicates whether the model is updated
*/
void XLeaderPS::WaitForFinishing(const int* activeJobWorkers, const int isToUpdate)
{
int activeCount = 0;
for (int i = 0; i < jworkers.count; i++) {
if (activeJobWorkers[i] > 0) {
XWorker* worker = (XWorker*)jworkers[i];
worker->DequeueFinishedJob();
activeCount++;
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
if (activeCount > 0 && isToUpdate) {
for (int i = 0; i < cworkers.count; i++) {
XWorker* worker = (XWorker*)cworkers[i];
for (int j = 0; j < serverModel.paramNum * activeCount; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < uworkers.count; i++) {
XWorker* worker = (XWorker*)uworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < bworkers.count; i++) {
XWorker* worker = (XWorker*)bworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
}
/*
run the model (for one time). Basically this is a map-reduce process. run the model (for one time). Basically this is a map-reduce process.
>> config - the configuration >> config - the configuration
>> dataDistributor - data distributor >> dataDistributor - data distributor
...@@ -133,8 +174,8 @@ int XLeaderPS::RunModel(XConfig* config, DataDistributeBase* dataDistributor, in ...@@ -133,8 +174,8 @@ int XLeaderPS::RunModel(XConfig* config, DataDistributeBase* dataDistributor, in
/* job in queue 1: run the model */ /* job in queue 1: run the model */
worker->AddJobNeuralNet(jmodel, worker->AddJobNeuralNet(jmodel,
worker->GetInput(), worker->GetOutput(), worker->GetInput(), worker->GetOutput(),
worker->GetGold(), worker->GetLoss()); worker->GetGold(), worker->GetLoss());
/* job in queue 1: make a record of the run */ /* job in queue 1: make a record of the run */
worker->AddJobRecord(&serverRecord); worker->AddJobRecord(&serverRecord);
......
...@@ -50,6 +50,9 @@ public: ...@@ -50,6 +50,9 @@ public:
/* create workers and other stuff used in training */ /* create workers and other stuff used in training */
void MakeAll(XConfig * config, XModel * model, const int * devIDs, const int jobWorkerNum); void MakeAll(XConfig * config, XModel * model, const int * devIDs, const int jobWorkerNum);
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* run the model and update it (for one time) */ /* run the model and update it (for one time) */
bool Run(XConfig* config, DataDistributeBase* dataDistributor, XOptimizer* optimizer); bool Run(XConfig* config, DataDistributeBase* dataDistributor, XOptimizer* optimizer);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论