Commit c0bcb78b by xiaotong

updates of XLeader

parent 0483b910
......@@ -184,47 +184,6 @@ void XLeader::MakeAll(XConfig * config, XModel * model)
MakeParamMap();
}
/*
wait for finished states (i.e., all workers finish their jobs)
>> activeJobWorkers - indicates whether each job worker is active
>> isToUpdate - indicates whether the model is updated
*/
void XLeader::WaitForFinishing(const int* activeJobWorkers, const int isToUpdate)
{
int activeCount = 0;
for (int i = 0; i < jworkers.count; i++) {
if (activeJobWorkers[i] > 0) {
XWorker* worker = (XWorker*)jworkers[i];
worker->DequeueFinishedJob();
activeCount++;
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
if (activeCount > 0 && isToUpdate) {
for (int i = 0; i < cworkers.count; i++) {
XWorker* worker = (XWorker*)cworkers[i];
for(int j = 0; j < serverModel.paramNum * activeCount; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < uworkers.count; i++) {
XWorker* worker = (XWorker*)uworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < bworkers.count; i++) {
XWorker* worker = (XWorker*)bworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
}
/* get loss */
float XLeader::GetLoss()
{
......
......@@ -138,9 +138,6 @@ public:
/* prepare for running */
void MakeAll(XConfig * config, XModel * model);
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* get loss */
float GetLoss();
......
......@@ -49,6 +49,21 @@ public:
/* destructor */
~XLeaderAllReduce();
/* create workers and other stuff used in training */
void MakeAll(XConfig * config, XModel * model, const int * devIDs, const int jobWorkerNum);
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* run the model and update it (for one time) */
bool Run(XConfig* config, DataDistributeBase* dataDistributor, XOptimizer* optimizer);
/* run the model */
int RunModel(XConfig* config, DataDistributeBase* dataDistributor, int* active);
/* update the model */
void RunUpdate(XConfig* config, XOptimizer* optimizer, const int* active);
};
}
......
......@@ -44,7 +44,7 @@ XLeaderPS::~XLeaderPS()
}
/*
create workers
create workers and other stuff
>> config - configuration
>> model - the model that we run
>> devIDs - device ids of the workers (the first id is for server)
......@@ -63,6 +63,47 @@ void XLeaderPS::MakeAll(XConfig * config, XModel * model, const int * devIDs, co
}
/*
wait for finished states (i.e., all workers finish their jobs)
>> activeJobWorkers - indicates whether each job worker is active
>> isToUpdate - indicates whether the model is updated
*/
void XLeaderPS::WaitForFinishing(const int* activeJobWorkers, const int isToUpdate)
{
int activeCount = 0;
for (int i = 0; i < jworkers.count; i++) {
if (activeJobWorkers[i] > 0) {
XWorker* worker = (XWorker*)jworkers[i];
worker->DequeueFinishedJob();
activeCount++;
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
if (activeCount > 0 && isToUpdate) {
for (int i = 0; i < cworkers.count; i++) {
XWorker* worker = (XWorker*)cworkers[i];
for (int j = 0; j < serverModel.paramNum * activeCount; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < uworkers.count; i++) {
XWorker* worker = (XWorker*)uworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
for (int i = 0; i < bworkers.count; i++) {
XWorker* worker = (XWorker*)bworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
CheckNTErrors(worker->GetFinishedNumInQueue() == 0, "Incorrect job number!");
}
}
}
/*
run the model (for one time). Basically this is a map-reduce process.
>> config - the configuration
>> dataDistributor - data distributor
......@@ -133,8 +174,8 @@ int XLeaderPS::RunModel(XConfig* config, DataDistributeBase* dataDistributor, in
/* job in queue 1: run the model */
worker->AddJobNeuralNet(jmodel,
worker->GetInput(), worker->GetOutput(),
worker->GetGold(), worker->GetLoss());
worker->GetInput(), worker->GetOutput(),
worker->GetGold(), worker->GetLoss());
/* job in queue 1: make a record of the run */
worker->AddJobRecord(&serverRecord);
......
......@@ -50,6 +50,9 @@ public:
/* create workers and other stuff used in training */
void MakeAll(XConfig * config, XModel * model, const int * devIDs, const int jobWorkerNum);
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* run the model and update it (for one time) */
bool Run(XConfig* config, DataDistributeBase* dataDistributor, XOptimizer* optimizer);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论