Commit c6f50a22 by xiaotong

load batch of sequences on both language sides

parent 430f0dfc
......@@ -67,17 +67,17 @@ void AttDecoder::InitModel(int argc, char ** argv,
/*
make the decoding network
>> input - the input tensor of the decoder
>> encoderOutput - the output tensor of the encoder
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &input, XTensor &encoderOutput, XTensor &mask, bool isTraining)
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
XTensor x;
x = embedder.Make(input);
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -106,7 +106,7 @@ XTensor AttDecoder::Make(XTensor &input, XTensor &encoderOutput, XTensor &mask,
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(encoderOutput, x, encoderOutput, mask, isTraining);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......
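For reference, the renamed call above passes the encoder output as both keys and values and the decoder state x as the query. Below is a minimal, self-contained C++ sketch of that cross-attention pattern; it does not use the XTensor API, and the lengths, dimension, and values are invented for illustration only.

// Minimal sketch of encoder-decoder (cross) attention, mirroring the argument
// pattern of attentionsEnde[i].Make(outputEnc, x, outputEnc, ...): queries come
// from the decoder state, keys and values from the encoder output.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int lenDec = 2;   /* decoder length (queries), example value */
    const int lenEnc = 3;   /* encoder length (keys/values), example value */
    const int d      = 4;   /* model dimension, example value */

    /* decoder states (queries) and encoder output (keys = values) */
    std::vector<std::vector<float>> q(lenDec, std::vector<float>(d, 0.5F));
    std::vector<std::vector<float>> kv(lenEnc, std::vector<float>(d, 1.0F));
    kv[1][0] = 2.0F;   /* make one encoder position stand out */

    for(int i = 0; i < lenDec; i++){
        /* scaled dot-product scores against every encoder position */
        std::vector<float> score(lenEnc, 0.0F);
        float norm = 0.0F;
        for(int j = 0; j < lenEnc; j++){
            float s = 0.0F;
            for(int k = 0; k < d; k++)
                s += q[i][k] * kv[j][k];
            score[j] = std::exp(s / std::sqrt((float)d));
            norm += score[j];
        }

        /* attention output = softmax(scores) * V */
        for(int k = 0; k < d; k++){
            float o = 0.0F;
            for(int j = 0; j < lenEnc; j++)
                o += (score[j] / norm) * kv[j][k];
            printf("%.3f ", o);
        }
        printf("\n");
    }
    return 0;
}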
......@@ -48,7 +48,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &input, XTensor &encoderOutput, XTensor &mask, bool isTraining);
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
};
}
......
......@@ -90,13 +90,27 @@ make the encoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
{
return encoder.Make(input, mask, isTraining);
}
/*
make the entire network for language modeling (with the output softmax layer)
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> mask - the mask for positions that are/are not involved in computation
>> isTraining - indicates whether we are training the model
<< return - decoding result
*/
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
return decoder.Make(inputDec, outputEnc, mask, isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
......@@ -145,7 +159,7 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
//_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, isTraining);
encoding = MakeEncoder(input, mask, isTraining);
outputLayer.Make(encoding, output);
delete[] dims;
......@@ -156,6 +170,43 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
}
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &padding, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
/* generate mask to see "previous" words on the decoder side */
int len = inputDec.GetDim(inputDec.order - 2);
int * dims = new int[inputDec.order + 1];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order] = len;
InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent attention to the current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
outputLayer.Make(decoding, output);
delete[] dims;
}
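A standalone sketch (not part of the commit) of the arithmetic behind the two mask calls above, assuming _SetDataLowTri with a shift of 0 treats the diagonal as part of the lower triangle: the lower triangle is first filled with 1e9 and the rest with 0, and the scale-and-shift then yields 0 for visible positions and -1e9 for future positions.

// Illustration of the decoder self-attention mask built in MakeMT:
// fill the lower triangle with 1e9 and everything else with 0, then scale by
// 1.0 and shift by -1e9, giving 0 for allowed positions and -1e9 for the rest.
#include <cstdio>

int main()
{
    const int len = 4;              /* target sequence length (example value) */
    float mask[len][len];

    /* step 1: emulate _SetDataLowTri(&maskDec, 1e9F, 0) */
    for(int i = 0; i < len; i++)
        for(int j = 0; j < len; j++)
            mask[i][j] = (j <= i) ? 1e9F : 0.0F;

    /* step 2: emulate _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F) */
    for(int i = 0; i < len; i++)
        for(int j = 0; j < len; j++)
            mask[i][j] = mask[i][j] * 1.0F + (-1e9F);

    /* rows: decoder positions; 0 = attendable, -1e9 = blocked (future) */
    for(int i = 0; i < len; i++){
        for(int j = 0; j < len; j++)
            printf("%10.1e ", mask[i][j]);
        printf("\n");
    }
    return 0;
}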
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
......
......@@ -69,11 +69,17 @@ public:
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the entire network for langauge modeling (with the output softmax layer) */
/* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &padding, bool isTraining);
/* get parameter matrices */
void GetParams(XList &list);
......
......@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
......@@ -189,7 +190,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, wc, devID, mem))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
......@@ -197,7 +200,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XTensor output;
/* make the network */
model->MakeLM(batch, output, padding, true);
if(model->isLM)
model->MakeLM(batch, output, padding, true);
else if(model->isMT)
model->MakeMT(batch, gold, output, padding, true);
else{
ShowNTErrors("Illegal model type!");
}
/* back-propagation for obtaining gradients */
if (labelSmoothingP > 0)
......@@ -222,13 +231,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* back-propagation */
net.Backward(output, g, CROSSENTROPY);
/*for(int i = 0; i < net.nodes.count; i++){
XTensor * node = (XTensor*)net.nodes.Get(i);
XLink::ShowNode(stderr, node);
}
exit(0);*/
gradStep += 1;
loss += -prob;
wordCount += wc;
......@@ -335,7 +337,9 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
ClearBuf();
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, vSize, vSizeTgt,
1, 1, false, wc, devID, mem))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
......@@ -343,7 +347,13 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
XTensor output;
/* make the network */
model->MakeLM(batch, output, padding, false);
if(model->isLM)
model->MakeLM(batch, output, padding, false);
else if(model->isMT)
model->MakeMT(batch, gold, output, padding, false);
else{
ShowNTErrors("Illegal model type!");
}
int bSize = batch.GetDim(0);
int length = batch.GetDim(1);
......@@ -532,7 +542,6 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
offset = 0;
for(int i = 0; i < seqCount; i++){
SampleNode &node = nodes[count];
//fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){
seqLen2[count + j] = seqLen[node.id + j];
......@@ -562,7 +571,7 @@ void T2TTrainer::ClearBuf()
nextSeq = -1;
}
/*
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training LMs
......@@ -570,8 +579,8 @@ load a batch of sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> step - the step we go over when move to the next sequence
>> vs - vocabulary size
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
......@@ -582,12 +591,47 @@ load a batch of sequences
int T2TTrainer::LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
{
if(isLM){
return LoadBatchLM(file, batch, padding, output, seqs,
vsEnc, sBatch, wBatch,
isSorted, wCount, devID, mem);
}
else{
return LoadBatchMT(file, batch, padding, output, seqs,
vsEnc, vsDec, sBatch, wBatch,
isSorted, wCount, devID, mem);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> vs - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
*/
int T2TTrainer::LoadBatchLM(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, step);
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
......@@ -614,74 +658,175 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
if(sc <= 0)
return 0;
if(isLM){
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(batch->grad == NULL)
XNoder::MakeGrad(batch);
else
InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(padding->grad == NULL)
XNoder::MakeGrad(padding);
else
InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
if(output->grad == NULL)
XNoder::MakeGrad(output);
else
InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
batch->grad->SetZeroAll();
padding->grad->SetZeroAll();
output->grad->SetZeroAll();
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
XNoder::MakeGrad(batch);
XNoder::MakeGrad(padding);
XNoder::MakeGrad(output);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
batch->grad->SetZeroAll();
padding->grad->SetZeroAll();
output->grad->SetZeroAll();
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
fflush(tf);
fflush(tf);
return sc;
}
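The loop above implements next-word prediction: the target at position w-1 is the input word at position w, and the last target is either the final word itself (when isDoubledEnd is set) or the word that follows the truncated sequence. A toy sketch of that indexing, with invented word ids:

// Toy illustration of the LM target construction in LoadBatchLM: the target at
// position w-1 is the input word at position w (next-word prediction).
// The word ids and the isDoubledEnd flag below are made-up example values.
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> seq = {11, 24, 7, 35, 2};   /* one sequence from the buffer */
    bool isDoubledEnd = true;                    /* example setting */

    int len = isDoubledEnd ? (int)seq.size() : (int)seq.size() - 1;
    std::vector<int> input(len), target(len);

    for(int w = 0; w < len; w++){
        input[w] = seq[w];
        if(w > 0)
            target[w - 1] = seq[w];
        if(w == len - 1)
            target[w] = isDoubledEnd ? seq[w] : seq[w + 1];
    }

    for(int w = 0; w < len; w++)
        printf("pos %d: input=%d -> target=%d\n", w, input[w], target[w]);
    return 0;
}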
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
*/
int T2TTrainer::LoadBatchMT(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
/* target-side sequence */
wnDec = seqLen[seq + sc];
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
if(sc >= sBatch * 2 && wcEnc >= wBatch)
break;
}
nextSeq = seq + sc;
if(sc <= 0)
return 0;
int sCount = sc/2;
int seqSize = 0;
int dimsEnc[3] = {sCount, maxEnc, vsEnc};
int dimsDec[3] = {sCount, maxDec, vsDec};
InitTensor(batch, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor(output, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
wCount = 0;
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
wCount++;
}
}
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
padding->Set2D(1.0F, sent, w);
if(w > 0)
output->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1)
output->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
wCount++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
return sc;
......
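LoadBatchMT assumes the buffer stores source and target sequences interleaved, source at even positions and target at odd positions, which is why the loops above advance in steps of 2 and map sequence s to sentence row (s - seq)/2. A small standalone sketch of that pairing, with invented lengths:

// Sketch of the interleaved source/target layout assumed by LoadBatchMT:
// sequences come in (source, target) pairs, so the loader walks the buffer in
// steps of 2 and maps the pair index back to a sentence row in the batch.
// The lengths below are invented example values.
#include <cstdio>

int main()
{
    /* lengths of buffered sequences: src0, tgt0, src1, tgt1, src2, tgt2 */
    int seqLen[] = {5, 6, 3, 4, 7, 7};
    int nseq = 6;

    int maxEnc = 0, maxDec = 0, wcEnc = 0, wcDec = 0;
    for(int s = 0; s < nseq; s += 2){
        int srcLen = seqLen[s];
        int tgtLen = seqLen[s + 1];
        wcEnc += srcLen;
        wcDec += tgtLen;
        if(srcLen > maxEnc) maxEnc = srcLen;
        if(tgtLen > maxDec) maxDec = tgtLen;

        /* sentence row this pair occupies in the batch tensors */
        int sent = s / 2;
        printf("pair %d: source len %d, target len %d\n", sent, srcLen, tgtLen);
    }

    int sCount = nseq / 2;
    printf("batch: %d sentence pairs, encoder %d x %d, decoder %d x %d\n",
           sCount, sCount, maxEnc, sCount, maxDec);
    printf("encoder words: %d, decoder words: %d\n", wcEnc, wcDec);
    return 0;
}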
......@@ -79,6 +79,9 @@ public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
......@@ -160,10 +163,24 @@ public:
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
......