Commit e1ed713a by xuchen

optimize the t2t code

parent bdf5c952
@@ -34,7 +34,12 @@ namespace nts{
 /* compute dE/dx of a node */
 void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
 {
-    CheckNTErrors(node->grad != NULL, "No gradient found!");
+    if (!isEfficient) {
+        CheckNTErrors(node->grad != NULL, "No gradient found!");
+    }
+    else {
+        CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
+    }
 
     XLink &income = node->income;
     int operID = income.typeID;
......
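Note: in efficient mode the backward pass skips tensors that do not require gradients, so the old unconditional assertion would fire on legitimately gradient-free nodes. A minimal sketch of the relaxed invariant the new check enforces (a standalone helper for illustration only, not part of the library):

    /* sketch: the gradient invariant under the two modes; "node" is assumed
       to expose isGrad (whether a gradient is required) and grad (the
       gradient tensor, NULL when absent), as in the diff above */
    bool GradInvariantHolds(const XTensor * node, bool isEfficient)
    {
        if (!isEfficient)
            return node->grad != NULL;               /* every node needs a gradient */
        return !node->isGrad || node->grad != NULL;  /* only flagged nodes do */
    }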
@@ -131,32 +131,20 @@ XTensor T2TEmbedder::Make(XTensor &input)
     XTensor wordEmbedding;
     XTensor posEmbedding;
 
-    bool match = (posEmbedding.order == input.order);
-    if(match){
-        for(int i = 0; i < input.order; i++){
-            if(dims[i] != posEmbedding.GetDim(i))
-                match = false;
-        }
-    }
-
-    /* we make positional embeddings first */
-    //if(!match){
-    if(true){
-        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
-
-        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, devID);
-
-        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
-        _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
-
-        DelTensorBuf(posTMP);
-    }
+    /* make positional embeddings */
+    XTensor position;
+    XTensor embTMP;
+
+    InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
+    position.Range(0, position.unitNum, 1);
+    embTMP = Gather(posEmbeddingBase, position);
+    posEmbedding = Unsqueeze(embTMP, 0, dims[0]);
 
-    /* then we make word embeddings */
+    /* make word embeddings */
     wordEmbedding = Gather(w, input);
     wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
 
-    /* we sum over the two embeddings */
+    /* sum over the two embeddings */
     return wordEmbedding + posEmbedding;
 }
......
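Note: the rewritten embedder replaces the copy-the-whole-table path with an index-and-gather path. A shape-annotated sketch of the same three steps, writing len for input.GetDim(-1) and assuming posEmbeddingBase is (maxLength, eSize) with dims[0] as the batch size:

    /* sketch: Gather-based positional embeddings (shapes are assumptions) */
    XTensor position;
    InitTensor1D(&position, len, X_INT, devID);        /* (len), int        */
    position.Range(0, len, 1);                         /* 0, 1, ..., len-1  */

    XTensor rows = Gather(posEmbeddingBase, position); /* (len, eSize)      */
    XTensor pos  = Unsqueeze(rows, 0, dims[0]);        /* (batch, len, eSize) */

This drops the temporary buffer (posTMP) and the explicit _CopyValues of the old code.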
@@ -114,64 +114,28 @@ make the network for language modeling (with the output softmax layer)
 */
 void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
 {
-    XTensor encoding;
-
-    /* generate mask to see "previous" words only */
-    //int len = input.GetDim(input.order - 2);
-    //int * dims = new int[input.order + 1];
-    //for(int i = 0; i < input.order; i++)
-    //    dims[i + 1] = input.GetDim(i);
-    //dims[0] = nhead;
-    //dims[input.order] = len;
-    //XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
-
-    int len = input.GetDim(input.order - 1);
-    int * dims = new int[input.order + 2];
-    for(int i = 0; i < input.order; i++)
-        dims[i + 1] = input.GetDim(i);
+    int len = padding.GetDim(padding.order - 1);
+    int * dims = new int[padding.order + 2];
+    for(int i = 0; i < padding.order; i++)
+        dims[i + 1] = padding.GetDim(i);
     dims[0] = nhead;
-    dims[input.order + 1] = len;
+    dims[padding.order + 1] = len;
 
     XTensor mask;
-    InitTensor(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
+    InitTensor(&mask, padding.order + 2, dims, X_FLOAT, padding.devID);
+    delete[] dims;
 
     /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
        this matrix can be used to prevent the attention to current or following words in
       a given sequence. */
     _SetDataLowTri(&mask, 1e9F, 0);
-    _ScaleAndShiftMe(&mask, 1.0F, -1e9F);
-
-    int * dimsPadding = new int[padding.order + 2];
-    for(int i = 0; i < padding.order - 1; i++)
-        dimsPadding[i] = padding.GetDim(i);
-    dimsPadding[padding.order - 1] = padding.GetDim(-1);
-    dimsPadding[padding.order] = padding.GetDim(-1);
-
-    XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
-                                      padding.devID);
-
-    for(int i = 0; i < padding2->order; i++)
-        dimsPadding[i + 1] = padding2->GetDim(i);
-    dimsPadding[0] = nhead;
-
-    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-    //                                  padding.devID);
-    //
-    ///* mask of the padding */
-    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
-    //_Unsqueeze(padding2, padding3, 0, nhead);
-    //
-    //_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-    //
-    ////_Sum(&mask, padding3, &mask);
+    ScaleAndShiftMe(mask, 1.0F, -1e9F);
 
+    /* forward */
+    XTensor encoding;
     encoding = MakeEncoder(input, mask, isTraining);
     outputLayer->Make(encoding, output);
-
-    delete[] dims;
-    delete[] dimsPadding;
-
-    //DelTensorBuf(padding3);
-    DelTensorBuf(padding2);
 }
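Note: to see what the two mask calls produce, take len = 4 and assume _SetDataLowTri(&mask, 1e9F, 0) fills the lower triangle including the main diagonal:

    after _SetDataLowTri(1e9F, 0):      after ScaleAndShiftMe(1.0F, -1e9F):
       1e9    0     0     0                0   -1e9  -1e9  -1e9
       1e9   1e9    0     0                0    0    -1e9  -1e9
       1e9   1e9   1e9    0                0    0     0    -1e9
       1e9   1e9   1e9   1e9               0    0     0     0

Added to the attention logits before the softmax, the -1e9 entries drive the weights on future positions to effectively zero, which is what makes the language model causal.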
/*
@@ -183,7 +147,9 @@ make the network for machine translation (with the output softmax layer)
 >> paddingDec - padding of the sequences (on the decoder side)
 >> isTraining - indicates whether the model is for training
 */
-void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
+void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
+                      XTensor &paddingEnc, XTensor &paddingDec,
+                      bool isTraining)
 {
     XTensor encoding;
     XTensor decoding;
@@ -192,10 +158,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
     XTensor maskEncDec;
 
     /* encoder mask */
-    MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc);
+    MakeMTMaskEnc(paddingEnc, maskEnc);
 
     /* decoder mask */
-    MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec);
+    MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
 
     encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
@@ -289,40 +255,21 @@ make the mask of the encoder
 >> paddingEnc - padding of the encoder input
 >> maskEnc - mask of the encoder self-attention
 */
-void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
+void T2TModel::MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc)
 {
-    /* padding on the source side */
-    int * dimsPadding = new int[paddingEnc.order + 2];
-    for (int i = 0; i < paddingEnc.order - 1; i++)
-        dimsPadding[i] = paddingEnc.GetDim(i);
-    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
-    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
-
-    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.devID);
-
-    for (int i = 0; i < padding2->order; i++)
-        dimsPadding[i + 1] = padding2->GetDim(i);
-    dimsPadding[0] = nhead;
-
-    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.devID);
+    XTensor padding2;
+    XTensor padding3;
 
     /* mask of the padding */
-    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
-    _Unsqueeze(padding2, padding3, 0, nhead);
-
-    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
+    Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
+    Unsqueeze(padding2, padding3, 0, nhead);
+    ScaleAndShiftMe(padding3, 1e9F, -1e9F);
 
-    InitTensor(&maskEnc, padding3);
+    InitTensor(&maskEnc, &padding3);
     maskEnc.SetZeroAll();
 
     /* generate the mask on the source language side (for padding) */
-    _Sum(&maskEnc, padding3, &maskEnc);
-
-    DelTensorBuf(padding3);
-    DelTensorBuf(padding2);
-    delete[] dimsPadding;
+    SumMe(maskEnc, padding3);
 }
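Note: the whole function now reduces to one affine map plus broadcasting. Each position contributes mask = 1e9 * padding - 1e9, i.e. 0 for a real token (padding == 1) and -1e9 for a pad (padding == 0). A shape sketch, assuming paddingEnc is (batchSize, lenEnc):

    paddingEnc                                          (batchSize, lenEnc)
    padding2 = Unsqueeze(paddingEnc, order-1, lenEnc)   (batchSize, lenEnc, lenEnc)
    padding3 = Unsqueeze(padding2, 0, nhead)            (nhead, batchSize, lenEnc, lenEnc)
    maskEnc  = 1e9 * padding3 - 1e9                     0 for tokens, -1e9 for pads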
/*
@@ -334,54 +281,33 @@ make the mask of the decoder
 >> maskDec - mask of the decoder self-attention
 >> maskEncDec - mask of the decoder enc-dec attention
 */
-void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
-                             XTensor &paddingEnc, XTensor &paddingDec,
+void T2TModel::MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
                              XTensor &maskDec, XTensor &maskEncDec)
 {
-    int len = inputDec.GetDim(inputDec.order - 1);
-    int * dims = new int[inputDec.order + 2];
-    for(int i = 0; i < inputDec.order; i++)
-        dims[i + 1] = inputDec.GetDim(i);
+    int len = paddingDec.GetDim(paddingDec.order - 1);
+    int * dims = new int[paddingDec.order + 2];
+    for(int i = 0; i < paddingDec.order; i++)
+        dims[i + 1] = paddingDec.GetDim(i);
     dims[0] = nhead;
-    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
+    dims[paddingDec.order + 1] = len;
+    InitTensor(&maskDec, paddingDec.order + 2, dims, X_FLOAT, paddingDec.devID);
 
     /* An upper triangular matrix where the cells of the upper triangle are set to -1e9.
        This matrix can be used to block the attention to current or following words in
       a given sequence. */
     _SetDataLowTri(&maskDec, 1e9F, 0);
-
-    //maskDec.Dump(stderr, "mask: ");
-
-    _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
-
-    //maskDec.Dump(stderr, "mask: ");
+    ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
 
     /* encoder-decoder mask that prevents the attention to padding dummy words */
-    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
-    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
+    XTensor maskEncDecTMP;
 
-    XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
-                                              paddingEnc.devID);
-    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
-
-    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
-
-    //paddingEnc.Dump(stderr, "paddingenc:");
-    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
-
-    _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
-
-    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
-
-    _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
-
-    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
-
-    DelTensorBuf(maskEncDecTMPDec);
-    DelTensorBuf(maskEncDecTMPEnc);
+    Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
+    ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
+    Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
 
     delete[] dims;
 }
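Note: under the same assumptions (paddingDec is (batchSize, lenDec)), the two decoder masks come out as:

    maskDec                                                  (nhead, batchSize, lenDec, lenDec)
        causal: 0 on and below the diagonal, -1e9 above it
    maskEncDecTMP = Unsqueeze(paddingEnc, order-1, lenDec)   (batchSize, lenDec, lenEnc)
        then mapped to {0, -1e9} by ScaleAndShiftMe(1e9F, -1e9F)
    maskEncDec    = Unsqueeze(maskEncDecTMP, 0, nhead)       (nhead, batchSize, lenDec, lenEnc)

so decoder self-attention is causal over decoder positions, while enc-dec attention only hides encoder-side pads.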
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
......
@@ -87,11 +87,10 @@ public:
                    XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
 
     /* make the mask of the encoder */
-    void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
+    void MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc);
 
     /* make the mask of the decoder */
-    void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
-                       XTensor &paddingEnc, XTensor &paddingDec,
+    void MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
                        XTensor &maskDec, XTensor &maskEncDec);
 
     /* get parameter matrices */
......
@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
     dims[inputEnc->order - 1] = 1;
 
     InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
-    _SetDataFixedInt(&first, startSymbol);
+    first.SetDataFixed(startSymbol);
 
     /* add a new word into the input sequence of the decoder side */
     if (inputLast == NULL) {
@@ -195,13 +195,13 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
     XTensor paddingDec;
     InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
-    SetDataFixedInt(paddingDec, 1);
+    paddingDec.SetDataFixed(1);
 
     XTensor maskDec;
     XTensor maskEncDec;
 
     /* decoder mask */
-    m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
+    m->MakeMTMaskDec(*paddingEnc, paddingDec, maskDec, maskEncDec);
 
     /* make the decoding network */
     decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
......
@@ -89,7 +89,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
     Prepare(input->unitNum/input->GetDim(-1), beamSize);
 
     /* encoder mask */
-    model->MakeMTMaskEnc(*input, *padding, maskEnc);
+    model->MakeMTMaskEnc(*padding, maskEnc);
 
     //input->Dump(stderr, "input:");
     //maskEnc.Dump(stderr, "maskenc:");
@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output)
     int * words = new int[maxLength];
 
     InitTensor(output, 3, dims, X_INT);
-    SetDataFixedInt(*output, -1);
+    output->SetDataFixed(-1);
 
     /* heap for an input sentence in the batch */
     for(int h = 0; h < batchSize; h++){
......
@@ -119,7 +119,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     int ws = 0;
     int wordCount = 0;
     int wordCountTotal = 0;
-    int wordCountBatch = 0;
+    int batchCountTotal = 0;
     bool isEnd = false;
     float loss = 0;
     float lr = 0;
@@ -174,9 +174,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
         /* gold standard */
         XTensor gold;
 
-        /* label smoothed gold standard (if needed) */
-        XTensor goldSmoothed;
-
         while (batchLoader.LoadBatch(file, model->isLM,
                                      &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
                                      NULL, vSize, vSizeTgt,
@@ -197,51 +194,34 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                 ShowNTErrors("Illegal model type!");
             }
 
-            /* back-propagation for obtaining gradients */
-            //if (labelSmoothingP > 0)
-            //    LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
-
+            /* get loss and probabilities */
             XTensor labelOnehot;
+            XTensor lossTensor;
 
             labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
-
-            /* make paddings for the output */
-            //if (output.GetDim(0) > 0)
-            //    PadOutput(&output, &labelOnehot, &paddingDec);
-
-            /* get probabilities */
-            //float prob = GetProb(&output, &labelOnehot, NULL);
-
-            XTensor lossTensor;
             lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
-            float prob = ReduceSumAll(lossTensor);
+            float lossBatch = ReduceSumAll(lossTensor);
 
-            DTYPE lossLocal = prob / wc;
+            DTYPE lossLocal = lossBatch / wc;
             bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
 
-            //XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
-
             if (doUpdate) {
-                /* rescale the output for normalized loss */
-                //RescaleOutput(&output, &labelOnehot, &paddingDec);
-
                 /* back-propagation */
                 net.Backward(lossTensor);
-                //net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
-                //net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
 
                 gradStep += 1;
-                loss += prob;
+                loss += lossBatch;
                 wordCount += wc;
                 wordCountTotal += wc;
+                batchCountTotal += ws;
 
-                //totalW = wc + ws;
-                wordCountBatch += ws;
-
                 /* update the parameters */
                 if(gradStep == updateStep){
                     /* learning rate */
-                    lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)validStep + 1, -0.5F - lrbias), ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
+                    lr = lrate * (1.0F / (float)sqrt((float)d)) *
+                         (float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
+                                    ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
 
                     /* model update */
                     Update(model, lr);
@@ -260,8 +240,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
             if (step % 100 == 0) {
                 double elapsed = GetClockSec() - startT;
-                XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-                        elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
+                XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
+                        elapsed, step, epoch,
+                        wordCountTotal, batchCountTotal,
+                        loss/wordCount, exp(loss/wordCount), exp(lossBatch/wc));
                 if (!doUpdate)
                     XPRINT(0, stderr, " (no update)");
                 XPRINT(0, stderr, "\n");
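Note: written out, the learning-rate line implements the usual Transformer warmup schedule with an extra lrbias exponent:

    lr = lrate * d^{-0.5} * min((validStep+1)^{-0.5-lrbias},
                                (validStep+1) * nwarmup^{-1.5-lrbias})

For example, with lrate = 1, lrbias = 0, d = 512 and nwarmup = 4000, the two branches cross at validStep+1 = 4000: lr grows linearly up to about 1/sqrt(512 * 4000) ≈ 7.0e-4, then decays as (validStep+1)^{-0.5}.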
@@ -301,12 +283,11 @@ test the model
 >> ofn - output data file
 >> model - model that is trained
 */
-void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
+void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
 {
     int wc = 0;
     int ws = 0;
     int wordCount = 0;
-    int wordCountTotal = 0;
     int sentCount = 0;
     float loss = 0;
@@ -316,14 +297,8 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     FILE * ofile = fopen(ofn, "wb");
     CheckNTErrors(ofile, "Cannot open the output file");
 
-    int devID = model->devID;
-
-    XNet net;
-
     double startT = GetClockSec();
 
-    wordCount = 0;
-
     /* batch of input sequences */
     XTensor batchEnc;
     XTensor batchDec;
@@ -346,7 +321,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     while(batchLoader.LoadBatch(file, model->isLM,
                                 &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
                                 seqs, vSize, vSizeTgt,
-                                1, 1, false, ws, wc, devID, false))
+                                1, 1, false, ws, wc, model->devID, false))
     {
         CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
@@ -366,15 +341,11 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
         int length = output.GetDim(1);
 
-        /* prediction probabilities */
-        XTensor probs;
-        InitTensor1D(&probs, bSize * length);
-
         XTensor labelOnehot;
+        XTensor lossTensor;
 
         labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
+        lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
 
-        /* get probabilities */
-        float prob = GetProb(&output, &labelOnehot, &probs);
+        float lossBatch = ReduceSumAll(lossTensor);
 
         /* dump the test result */
         for(int s = 0; s < bSize; s++){
@@ -390,7 +361,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
                 fprintf(ofile, "||| ");
                 for(int i = 0; i < length; i++){
                     if(seq[i] >= 0){
-                        DTYPE p = probs.Get1D(s * length + i);
+                        DTYPE p = lossTensor.Get2D(s, i);
                         fprintf(ofile, "%.3e ", p);
                         sum += p;
                     }
@@ -400,12 +371,12 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
             fprintf(ofile, "||| %e\n", sum);
         }
 
-        loss += -prob;
+        loss += lossBatch;
         wordCount += wc;
-        wordCountTotal += wc;
-        sentCount += 1;
+        sentCount += bSize;
     }
 
     fclose(file);
     fclose(ofile);
@@ -413,8 +384,8 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     double elapsed = GetClockSec() - startT;
 
-    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
-            elapsed, wordCountTotal, exp(loss / wordCount));
+    XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
+            elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
 }
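Note: the printed perplexity follows directly from the accumulated token-level cross entropy:

    loss = sum over batches of ReduceSumAll(CrossEntropy(output, labelOnehot, paddingDec))
    ppl  = exp(loss / wordCount)

so, for instance, an average loss of 3.0 nats per word corresponds to ppl = exp(3.0) ≈ 20.1.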
/*
@@ -428,64 +399,25 @@ make a checkpoint
 void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id)
 {
     char * fn = new char[MAX_LINE_LENGTH];
-    char * fn2 = new char[MAX_LINE_LENGTH];
     sprintf(fn, "%s.%s.%03d", modelFN, label, id);
-    sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
     model->Dump(fn);
-
-    //if(validFN != NULL){
-    //    T2TTrainer trainer;
-    //    trainer.Init(argNum, argArray);
-    //    trainer.Test(validFN, fn2, model);
-    //}
-
     delete[] fn;
-    delete[] fn2;
-}
+
+    char* fn2 = new char[MAX_LINE_LENGTH];
+    sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
+    if(validFN != NULL){
+        T2TTrainer trainer;
+        trainer.Init(argNum, argArray);
+        trainer.Validate(validFN, fn2, model);
+    }
+    delete[] fn2;
+}
 
-/*
-get word probabilities for a batch of sequences
->> output - word distribution for each position
->> gold - gold standard
->> wordProbs - word probability for gold prediction
-*/
-float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
-{
-    XTensor probs;
-    InitTensorV2(&probs, output);
-
-    _Multiply(output, gold, &probs);
-
-    /* probability of each word */
-    XTensor wprobs;
-    InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
-
-    int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
-    probs.Reshape(2, dims);
-    _ReduceSum(&probs, &wprobs, 1);
-
-    if(wordProbs != NULL)
-        _CopyValues(&wprobs, wordProbs);
-
-    /* reshape the tensor to fit it into the reduce procedure
-       TODO: XTensor supports scalars */
-    dims[0] = 1;
-    dims[1] = probs.unitNum;
-    probs.Reshape(2, dims);
-
-    /* probability for the batch */
-    XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output->devID);
-    _ReduceSum(&probs, &result, 1);
-
-    return result.Get1D(0);
-}
 
 /*
 update the model by delta rule
-\theta_new = \theta - \lrate * grad
+\theta_{new} = \theta - \lrate * grad
 where
-\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
+\lrate = d^{-0.5} * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
 >> model - the t2t model
 >> lr - learning rate
 */
@@ -531,7 +463,6 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
             _Sum(para, v2, para, -e);
 
             DelTensorBuf(v2);
         }
-
         else{
             /* the delta rule */
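Note: for reference, a minimal sketch of what the delta-rule branch amounts to, assuming each parameter tensor para carries its gradient in para->grad (as XShapeGrad does above) and reusing the _Sum(a, b, c, beta) form, c = a + b * beta, visible in the Adam branch:

    /* sketch: plain SGD step \theta_{new} = \theta - lr * grad,
       written as para = para + (-lr) * para->grad */
    _Sum(para, para->grad, para, -lr);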
@@ -574,86 +505,4 @@ void T2TTrainer::PrepareModel(T2TModel * model)
     adamBeta2T = 1.0F;
 }
 
-/*
-do padding on the output
->> output - output tensor of the network
->> gold - gold standard
->> padding - padding of a batch of sentences
->> lsP - smoothing factor
-*/
-void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
-{
-    if(output == NULL || padding == NULL)
-        return;
-
-    int on = output->order;
-    int * dimso = new int[on];
-
-    memcpy(dimso, output->dimSize, sizeof(int) * on);
-
-    output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
-
-    XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, padding->devID);
-
-    _CopyValues(padding, padding2);
-    _MultiplyDim(output, padding2, output, 0);
-    _ScaleAndShiftMe(padding2, 1e9F, -1e9F);
-    _SumDim(output, padding2, output, 0);
-
-    output->Reshape(on, dimso);
-
-    if(gold != NULL){
-        gold->Reshape(gold->unitNum/dimso[gold->order - 1], dimso[gold->order - 1]);
-        _CopyValues(padding, padding2);
-        _MultiplyDim(gold, padding2, gold, 0);
-        gold->Reshape(on, dimso);
-    }
-
-    delete[] dimso;
-    DelTensorBuf(padding2);
-}
-
-/*
-rescale the output and gold tensors for normalized loss
->> output - output tensor of the network
->> gold - gold standard
->> padding - padding of a batch of sentences
-*/
-void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding)
-{
-    CheckNTErrors(output->order == 3, "Wrong dimension number!");
-    CheckNTErrors(gold->order == 3, "Wrong dimension number!");
-
-    DTYPE count = _ReduceSumAll(padding);
-
-    _ExpMe(output);
-    _ScaleAndShiftMe(output, 1/count);
-    _LogMe(output);
-
-    _ScaleAndShiftMe(gold, 1/count);
-}
-
-/*
-perform label smoothing
->> gold - gold standard
->> smoothed - result of label smoothing
->> p - smoothing factor
-*/
-void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
-{
-    CheckNTErrors(p >= 0 && p <= 1.0F, "Smoothing factor must be in range [0,1]");
-
-    int n = gold->GetDim(-1);
-    DTYPE q = 1.0F - p;
-    DTYPE gift = p / n;
-
-    InitTensor(smoothed, gold);
-    _CopyValues(gold, smoothed);
-
-    if(p == 0)
-        return;
-
-    _ScaleAndShiftMe(smoothed, q, gift);
-}
-
 }
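Note: the removed LabelSmooth helper is subsumed by IndexToOnehot(label, vSizeTgt, labelSmoothingP) in Train. For a vocabulary of size n and smoothing factor p it computed

    smoothed = (1 - p) * gold + p / n

so a one-hot target keeps probability 1 - p + p/n on the gold word and spreads p/n over each remaining word. PadOutput and RescaleOutput likewise disappear because CrossEntropy now takes paddingDec directly.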
@@ -125,28 +125,16 @@ public:
     void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
 
     /* test the model */
-    void Test(const char * fn, const char * ofn, T2TModel * model);
+    void Validate(const char * fn, const char * ofn, T2TModel * model);
 
     /* make a checkpoint */
     void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
 
-    /* get word probabilities for a batch of sequences */
-    float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
-
     /* update the model by delta rule */
     void Update(T2TModel * model, const float lr);
 
     /* prepare model for training */
     void PrepareModel(T2TModel * model);
-
-    /* do padding on the output */
-    void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
-
-    /* rescale the output and gold tensors for normalized loss */
-    void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
-
-    /* perform label smoothing */
-    void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
 };
......
@@ -94,7 +94,7 @@ int TransformerMain(int argc, const char ** argv)
     else{
         T2TTrainer tester;
         tester.Init(argc, args);
-        tester.Test(testFN, outputFN, &model);
+        tester.Validate(testFN, outputFN, &model);
     }
 }
......
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
#ifndef __XTENSOR_H__ #ifndef __XTENSOR_H__
#define __XTENSOR_H__ #define __XTENSOR_H__
#include <math.h>
#include "XGlobal.h" #include "XGlobal.h"
#include "XMem.h" #include "XMem.h"
#include "XPRunner.h" #include "XPRunner.h"
...@@ -416,11 +415,11 @@ public: ...@@ -416,11 +415,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const; bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */ /* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0); void Dump(FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */ /* dump data to a file */
static static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0); void Dump(const XTensor * tensor, FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a binary file */ /* dump data to a binary file */
void BinaryDump(FILE * file); void BinaryDump(FILE * file);
......
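Note: with the new default argument, ad-hoc debugging no longer needs an explicit stream. A usage sketch (t is a hypothetical tensor):

    XTensor t;
    /* ... fill t ... */
    t.Dump();                  /* same as t.Dump(stderr) */
    t.Dump(stderr, "mask: ");  /* the old explicit form still works */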