Commit 8a6d5d3b by xiaotong

bug fixes and updates of XWorkerBroadcast

parent 1afdcdba
@@ -80,10 +80,10 @@ void TestTrain()
     config.Add("lrate", 0.1F);
     config.Add("nstep", 100000);
     config.Add("nepoch", 5);
-    config.Add("jobdev0", -1);
-    config.Add("jobdev1", -1);
-    config.Add("jobdev2", -1);
-    config.Add("jobdev3", -1);
+    config.Add("jobdev0", 0);
+    //config.Add("jobdev1", -1);
+    //config.Add("jobdev2", -1);
+    //config.Add("jobdev3", -1);
+    //config.Add("jobdev4", -1);
 
     int serverDevID = config.GetInt("jobdev0", -1);
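For context: each "jobdevN" entry assigns job worker N to a device, and the change above pins worker 0 to device 0 while leaving the other workers unset. A minimal sketch of how such entries are typically read back, assuming the usual NiuTensor convention that -1 means CPU and a nonnegative id selects a GPU (the loop and buffer are illustrative; only config.GetInt comes from the code above):

    /* illustrative: read per-worker device ids, defaulting to -1 (CPU)
       when an entry is absent or commented out */
    const int MAX_JOB_NUM = 4;   /* sketch-only constant */
    int jobDevIDs[MAX_JOB_NUM];
    for (int i = 0; i < MAX_JOB_NUM; i++) {
        char name[16];
        sprintf(name, "jobdev%d", i);
        jobDevIDs[i] = config.GetInt(name, -1);
    }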
@@ -415,7 +415,7 @@ void XLeader::MakeParamMap()
     }
 
     for (int j = 0, c = 0; j < jworkers.count; j++) {
-        XWorker * worker = (XWorker*)jworkers[i];
+        XWorker * worker = (XWorker*)jworkers[j];
         if (worker->GetWorkerType() == XWORKER_TYPE_JOB) {
             XModel * model = ((XWorkerJob*)jworkers[j])->GetModel();
             paramMap[i][c].tensor = model->params[i].tensor;
@@ -522,7 +522,7 @@ int XLeader::RunModel(XConfig * config, DataDistributeBase * dataDistributor, in
 }
 
 /*
-update the model
+update the model in a standard server-worker manner
 >> config - the configuration
 >> optimizer - the optimizer
 >> active - flag for each job worker (1 = active, 0 = not active)
@@ -555,7 +555,7 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
     CheckNTErrors(jobQueues.count == serverModel.paramNum, "Incompatible model!");
 
-    /* jobs in queue 2 (say jobQueue): collect the (gradient) data and other stuff.
+    /* jobs in queue 2 (say jobQueue): collect the (gradient) data.
        This is a reduce process. Then we add a job to update the model, followed
        by a job to broadcast the latest parameters to the workers. NOTE that we
        would update a worker to the latest model parameters, even if it is not
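Concretely, the comment above describes a three-stage job chain per parameter. A hedged sketch of what RunUpdate appears to enqueue for parameter j, using only calls visible in the hunks below, except the update call, whose real name is not shown in this diff and is therefore a placeholder:

    /* 1) reduce: collect each worker's gradient into the server-side keeper */
    collecter->AddJobCollectDataP2P(jobQueue, paramWorker.grad, paramServer.grad);
    collecter->AddJobEnqueueFinished(jobQueue);

    /* 2) update: apply the optimizer to the server-side parameter
       (AddJobUpdate is a placeholder name for the job added here) */
    updater->AddJobUpdate(jobQueue, &paramServer, optimizer);
    updater->AddJobEnqueueFinished(jobQueue);

    /* 3) broadcast: push the updated parameter back to all worker copies */
    broadcaster->AddJobBroadcast(jobQueue, &paramServer, &paramList[j]);
    broadcaster->AddJobEnqueueFinished(jobQueue);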
@@ -583,6 +583,8 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
         }
     }
 
+    XList * paramList = new XList[serverModel.paramNum];
+
     CheckNTErrors(modelCount == modelNum, "Wrong model number!");
 
     /* This is a simple implementation of the do-and-wait process */
@@ -620,6 +622,11 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
                 collecter->AddJobCollectDataP2P(jobQueue, paramWorker.grad, paramServer.grad);
                 collecter->AddJobEnqueueFinished(jobQueue);
 
+                /* We keep the worker parameter in a list. It is used later when
+                   we broadcast the updated parameter back to the workers; that
+                   is, this is a list of worker-side copies of the parameter. */
+                paramList[j].Add(&paramWorker);
+
                 /* reset the flag */
                 paramWorker.flag = PARAM_STATE_COLLECTED;
                 finished++;
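A side note on the types involved: judging only from the fields this commit touches (tensor, grad, flag), paramWorker and paramServer look like XTensorKeeper objects with roughly the following shape. This is a reconstruction for the reader, not the actual header, and the flag type name is an assumption:

    /* inferred sketch of XTensorKeeper, based on the fields used in this diff */
    struct XTensorKeeper {
        XTensor * tensor;    /* the parameter tensor itself */
        XTensor * grad;      /* its gradient, collected from the workers */
        PARAM_STATE flag;    /* e.g. PARAM_STATE_COLLECTED or PARAM_STATE_UPDATED */
    };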
@@ -637,7 +644,7 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
                 updater->AddJobEnqueueFinished(jobQueue);
 
                 /* broadcast the new parameter to other models */
-                broadcaster->AddJobBroadcast(jobQueue, &serverModel, &membersAll, j);
+                broadcaster->AddJobBroadcast(jobQueue, &paramServer, &paramList[j]);
                 broadcaster->AddJobEnqueueFinished(jobQueue);
             }
         }
@@ -658,6 +665,7 @@ void XLeader::RunUpdate(XConfig * config, XOptimizer * optimizer, const int * ac
     delete[] finishedCount;
     delete[] modelFlag;
+    delete[] paramList;
 }
 
 } /* end of the nts (NiuTrans.Tensor) namespace */
@@ -89,7 +89,7 @@ run the trainer (this is the core process)
 >> optimizer - the optimizer
 */
 void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
-                XModel * model, XOptimizer * optimizer)
+                   XModel * model, XOptimizer * optimizer)
 {
     CheckNTErrors(config != NULL, "No input config!");
     CheckNTErrors(dataDistributor != NULL, "No input data distributor!");
@@ -150,7 +150,7 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
             if ((step + 1) % 100 == 0)
                 XPRINT5(1, stderr, "[INFO] elapsed=%.1fs epoch:%d step:%d sample:%d loss:%f\n",
-                    GetClockSec() - startT, epoch + 1, step + 1, leader.GetSampleNum(), loss);
+                        GetClockSec() - startT, epoch + 1, step + 1, leader.GetSampleNum(), loss);
 
             leader.ResetParamGrad();
@@ -166,10 +166,12 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
         dataDistributor->End();
 
         if (step >= optimizer->nstep)
-        break;
+            break;
     }
 
     delete[] ids;
+
+    XPRINT(1, stderr, "[INFO] Training Finished [DONE]\n");
 }
 
 /* show settings of training */
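As an aside, the progress line above follows a plain every-N-steps logging pattern. A self-contained sketch of the same pattern in isolation (names and loop bounds invented for the sketch):

    #include <stdio.h>
    #include <time.h>

    int main() {
        double startT = (double)clock() / CLOCKS_PER_SEC;
        for (int step = 0; step < 1000; step++) {
            /* ... one training step would run here ... */
            if ((step + 1) % 100 == 0) {
                double elapsed = (double)clock() / CLOCKS_PER_SEC - startT;
                fprintf(stderr, "[INFO] elapsed=%.1fs step:%d\n", elapsed, step + 1);
            }
        }
        fprintf(stderr, "[INFO] Training Finished [DONE]\n");
        return 0;
    }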
@@ -53,22 +53,21 @@ void XWorkerBroadcast::SetBroadcastMode(DATA_BROADCAST_TYPE myMode)
 /*
 broadcast data for a parameter
 >> source - the data (as a model) that we want to broadcast
->> targetList - the target places that we recieve the data
->> pid - the parameter index
+>> targetList - the target places where we receive the data
 */
-void XWorkerBroadcast::BroadcastData(XModel * source, XList * targetList, int pid)
+void XWorkerBroadcast::BroadcastData(XTensorKeeper * source, XList * targetList)
 {
-    CheckNTErrors(source->params[pid].flag == PARAM_STATE_UPDATED,
+    CheckNTErrors(source->flag == PARAM_STATE_UPDATED,
                   "The parameter is not ready for broadcasting");
 
     for (int i = 0; i < targetList->count; i++) {
-        XModel * target = (XModel*)targetList->GetItem(i);
+        XTensorKeeper * target = (XTensorKeeper*)targetList->GetItem(i);
 
         /* data transmit */
-        BroadcastP2P(source->params[pid].tensor, target->params[pid].tensor);
+        BroadcastP2P(source->tensor, target->tensor);
 
         /* update the flag */
-        target->params[pid].flag = PARAM_STATE_UPDATED;
+        target->flag = PARAM_STATE_UPDATED;
     }
 }
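A hedged usage sketch of the new BroadcastData signature: broadcast one updated server-side parameter to two worker-side copies. The keeper and list setup here is illustrative; only BroadcastData itself, the tensor/flag fields, and PARAM_STATE_UPDATED appear in this diff:

    XTensorKeeper server;                /* holds the freshly updated parameter */
    server.flag = PARAM_STATE_UPDATED;   /* required by the CheckNTErrors guard */

    XTensorKeeper worker0, worker1;      /* worker-side copies of the parameter */
    XList targets;
    targets.Add(&worker0);
    targets.Add(&worker1);

    broadcaster->BroadcastData(&server, &targets);
    /* afterwards both worker flags are PARAM_STATE_UPDATED */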
@@ -81,20 +80,17 @@ void XWorkerBroadcast::Broadcast(XList * args)
     int paramCount = 0;
 
     XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(paramCount++);
-    XModel * source = (XModel*)args->GetItem(paramCount++);
+    XTensorKeeper * source = (XTensorKeeper*)args->GetItem(paramCount++);
 
     /* target models */
     int targetNum = args->GetItemInt(paramCount++);
     XList target;
     for (int i = 0; i < targetNum; i++) {
-        XModel * model = (XModel*)args->GetItem(paramCount++);
+        XTensorKeeper * model = (XTensorKeeper*)args->GetItem(paramCount++);
         target.Add(model);
     }
 
-    /* parameter index */
-    int p = args->GetInt(paramCount++);
-
-    broadcaster->BroadcastData(source, &target, p);
+    broadcaster->BroadcastData(source, &target);
 }
 
 /*
@@ -116,21 +112,18 @@ void XWorkerBroadcast::BroadcastP2P(XTensor * source, XTensor * target)
 add a new job of broadcasting data (for a parameter)
 >> jobQueue - the queue where we push jobs
 >> source - the data that we want to broadcast
->> targetList - the target places that we recieve the data
->> pid - the parameter index
+>> targetList - the target places where we receive the data
 */
-bool XWorkerBroadcast::AddJobBroadcast(XQueue * jobQueue, XModel * source, XList * targetList, int pid)
+bool XWorkerBroadcast::AddJobBroadcast(XQueue * jobQueue, XTensorKeeper * source, XList * targetList)
 {
     CheckNTErrors(source != NULL, "no input source tensor!");
     CheckNTErrors(targetList != NULL, "no input target tensor list!");
-    CheckNTErrors(pid >= 0 && pid < source->paramNum, "illegal parameter index!");
 
     XList args;
     args.Add(this);
     args.Add(source);
     args.AddInt(targetList->count);
     args.AddList(targetList);
-    args.AddInt(pid);
 
     XQueue& queueRun = jobQueue != NULL ? *jobQueue : queue;
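For completeness, a sketch of queuing the same broadcast through the job interface rather than calling BroadcastData directly. The queue and keeper names are invented; the call pattern mirrors the RunUpdate hunk above. Note that AddJobBroadcast packs its arguments (worker, source, target count, targets) in exactly the order the static Broadcast wrapper unpacks them, which is why the pid entry was removed from both sides:

    XQueue jobQueue;              /* per-parameter job queue (illustrative) */
    XTensorKeeper paramServer;    /* updated server-side parameter */
    XList workerCopies;           /* XTensorKeeper* entries, one per worker */

    broadcaster->AddJobBroadcast(&jobQueue, &paramServer, &workerCopies);
    broadcaster->AddJobEnqueueFinished(&jobQueue);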
@@ -61,7 +61,7 @@ public:
     void SetBroadcastMode(DATA_BROADCAST_TYPE myMode);
 
     /* broadcast data for a parameter */
-    void BroadcastData(XModel * source, XList * targetList, int pid);
+    void BroadcastData(XTensorKeeper * source, XList * targetList);
 
     /* wrapper of BroadcastDataSingle */
     static
@@ -71,7 +71,7 @@ public:
     void BroadcastP2P(XTensor * source, XTensor * target);
 
     /* add a new job of broadcasting data (for a parameter) */
-    bool AddJobBroadcast(XQueue * jobQueue, XModel * source, XList * targetList, int pid);
+    bool AddJobBroadcast(XQueue * jobQueue, XTensorKeeper * source, XList * targetList);
 };
 
 }