Commit e1ed713a by xuchen

optimize the t2t code

parent bdf5c952
......@@ -34,7 +34,12 @@ namespace nts{
/* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
......
......@@ -131,32 +131,20 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding;
XTensor posEmbedding;
bool match = (posEmbedding.order == input.order);
if(match){
for(int i = 0; i < input.order; i++){
if(dims[i] != posEmbedding.GetDim(i))
match = false;
}
}
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
/* make positional embeddings */
XTensor position;
XTensor embTMP;
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, devID);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
_Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
DelTensorBuf(posTMP);
}
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
position.Range(0, position.unitNum, 1);
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, dims[0]);
/* then we make word embeddings */
/* make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
/* sum over the two embeddings */
return wordEmbedding + posEmbedding;
}
......
......@@ -114,64 +114,28 @@ make the network for language modeling (with the output softmax layer)
*/
void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
{
XTensor encoding;
/* generate mask to see "previous" words only */
//int len = input.GetDim(input.order - 2);
//int * dims = new int[input.order + 1];
//for(int i = 0; i < input.order; i++)
// dims[i + 1] = input.GetDim(i);
//dims[0] = nhead;
//dims[input.order] = len;
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
int len = padding.GetDim(padding.order - 1);
int * dims = new int[padding.order + 2];
for(int i = 0; i < padding.order; i++)
dims[i + 1] = padding.GetDim(i);
dims[0] = nhead;
dims[input.order + 1] = len;
dims[padding.order + 1] = len;
XTensor mask;
InitTensor(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
InitTensor(&mask, padding.order + 2, dims, X_FLOAT, padding.devID);
delete[] dims;
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.devID);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
ScaleAndShiftMe(mask, 1.0F, -1e9F);
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
// padding.devID);
//
///* mask of the padding */
//_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
//_Unsqueeze(padding2, padding3, 0, nhead);
//
//_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//
////_Sum(&mask, padding3, &mask);
/* forward */
XTensor encoding;
encoding = MakeEncoder(input, mask, isTraining);
outputLayer->Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
//DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
......@@ -183,7 +147,9 @@ make the network for machine translation (with the output softmax layer)
>> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec,
bool isTraining)
{
XTensor encoding;
XTensor decoding;
......@@ -192,10 +158,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor maskEncDec;
/* encoder mask */
MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc);
MakeMTMaskEnc(paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec);
MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
......@@ -289,40 +255,21 @@ make the mask of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
*/
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
void T2TModel::MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc)
{
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor padding2;
XTensor padding3;
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, padding3, 0, nhead);
ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensor(&maskEnc, &padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
delete[] dimsPadding;
SumMe(maskEnc, padding3);
}
/*
......@@ -334,54 +281,33 @@ make the mask of the decoder
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
void T2TModel::MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
int len = paddingDec.GetDim(paddingDec.order - 1);
int * dims = new int[paddingDec.order + 2];
for(int i = 0; i < paddingDec.order; i++)
dims[i + 1] = paddingDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
dims[paddingDec.order + 1] = len;
InitTensor(&maskDec, paddingDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* An upper triangular matrix where the cells of the upper triangular are set to -1e9.
This matrix can be used to block the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
//maskDec.Dump(stderr, "mask: ");
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
//maskDec.Dump(stderr, "mask: ");
ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor maskEncDecTMP;
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//paddingEnc.Dump(stderr, "paddingenc:");
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
delete[] dims;
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
......
......@@ -87,11 +87,10 @@ public:
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
void MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
void MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */
......
......@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol);
first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
......@@ -195,13 +195,13 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
paddingDec.SetDataFixed(1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
m->MakeMTMaskDec(*paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
......
......@@ -89,7 +89,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
Prepare(input->unitNum/input->GetDim(-1), beamSize);
/* encoder mask */
model->MakeMTMaskEnc(*input, *padding, maskEnc);
model->MakeMTMaskEnc(*padding, maskEnc);
//input->Dump(stderr, "input:");
//maskEnc.Dump(stderr, "maskenc:");
......@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output)
int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
output->SetDataFixed(-1);
/* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){
......
......@@ -119,7 +119,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int ws =0;
int wordCount = 0;
int wordCountTotal = 0;
int wordCountBatch = 0;
int batchCountTotal = 0;
bool isEnd = false;
float loss = 0;
float lr = 0;
......@@ -174,9 +174,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* gold standard */
XTensor gold;
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
......@@ -197,51 +194,34 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
ShowNTErrors("Illegal model type!");
}
/* back-propagation for obtaining gradients */
//if (labelSmoothingP > 0)
// LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* get loss and probabilities */
XTensor labelOnehot;
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
/* make paddings for the output */
//if (output.GetDim(0) > 0)
//PadOutput(&output, &labelOnehot, &paddingDec);
/* get probabilities */
//float prob = GetProb(&output, &labelOnehot, NULL);
XTensor lossTensor;
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float prob = ReduceSumAll(lossTensor);
float lossBatch = ReduceSumAll(lossTensor);
DTYPE lossLocal = prob / wc;
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
//XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
if (doUpdate) {
/* rescale the output for normalized loss */
//RescaleOutput(&output, &labelOnehot, &paddingDec);
/* back-propagation */
net.Backward(lossTensor);
//net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
//net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
gradStep += 1;
loss += prob;
loss += lossBatch;
wordCount += wc;
wordCountTotal += wc;
batchCountTotal += ws;
//totalW = wc + ws;
wordCountBatch += ws;
/* update the parameters */
if(gradStep == updateStep){
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)validStep + 1, -0.5F - lrbias), ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
lr = lrate * (1.0F / (float)sqrt((float)d)) *
(float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* model update */
Update(model, lr);
......@@ -260,8 +240,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch,
wordCountTotal, batchCountTotal,
loss/wordCount, exp(loss/wordCount), exp(lossBatch /wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
......@@ -301,12 +283,11 @@ test the model
>> ofn - output data file
>> model - model that is trained
*/
void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
float loss = 0;
......@@ -316,14 +297,8 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
......@@ -346,7 +321,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, false))
1, 1, false, ws, wc, model->devID, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
......@@ -366,15 +341,11 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
int length = output.GetDim(1);
/* prediction probabilities */
XTensor probs;
InitTensor1D(&probs, bSize * length);
XTensor labelOnehot;
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
/* get probabilities */
float prob = GetProb(&output, &labelOnehot, &probs);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAll(lossTensor);
/* dump the test result */
for(int s = 0; s < bSize; s++){
......@@ -390,7 +361,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
fprintf(ofile, "||| ");
for(int i = 0; i < length; i++){
if(seq[i] >= 0){
DTYPE p = probs.Get1D(s * length + i);
DTYPE p = lossTensor.Get2D(s, i);
fprintf(ofile, "%.3e ", p);
sum += p;
}
......@@ -400,10 +371,10 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
fprintf(ofile, "||| %e\n", sum);
}
loss += -prob;
loss += lossBatch;
wordCount += wc;
wordCountTotal += wc;
sentCount += 1;
sentCount += bSize;
}
fclose(file);
......@@ -413,8 +384,8 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, exp(loss / wordCount));
XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
}
/*
......@@ -428,64 +399,25 @@ make a checkpoint
void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id)
{
char * fn = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
model->Dump(fn);
//if(validFN != NULL){
//T2TTrainer trainer;
//trainer.Init(argNum, argArray);
//trainer.Test(validFN, fn2, model);
//}
delete[] fn;
delete[] fn2;
}
/*
get word probabilities for a batch of sequences
>> output - word distribution for each position
>> gold - gold standard
>> wordProbs - word probability for gold prediction
*/
float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
{
XTensor probs;
InitTensorV2(&probs, output);
_Multiply(output, gold, &probs);
/* probability of each word */
XTensor wprobs;
InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
probs.Reshape(2, dims);
_ReduceSum(&probs, &wprobs, 1);
if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
dims[0] = 1;
dims[1] = probs.unitNum;
probs.Reshape(2, dims);
/* probability for the batch */
XTensor result;
InitTensor1D(&result, 1, X_FLOAT, output->devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
char* fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
if(validFN != NULL){
T2TTrainer trainer;
trainer.Init(argNum, argArray);
trainer.Validate(validFN, fn2, model);
}
delete[] fn2;
}
/*
update the model by delta rule
\theta_new = \theta - \lrate * grad
\theta_{new} = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
\lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
>> model - the t2t model
>> lr - learning rate
*/
......@@ -531,7 +463,6 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
}
else{
/* the delta rule */
......@@ -574,86 +505,4 @@ void T2TTrainer::PrepareModel(T2TModel * model)
adamBeta2T = 1.0F;
}
/*
do padding on the output: the output is temporarily reshaped to 2D, multiplied
by the padding along dimension 0, summed with a rescaled copy of the padding,
and then restored to its original shape. If a gold standard is given, it is
multiplied by the padding in the same way.
>> output - output tensor of the network (nothing is done if NULL)
>> gold - gold standard (optional, skipped if NULL)
>> padding - padding of a batch of sentences (nothing is done if NULL)
*/
void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
{
/* nothing to do without an output or a padding */
if(output == NULL || padding == NULL)
return;
/* record the original shape of the output so that it can be restored below */
int on = output->order;
int * dimso = new int[on];
memcpy(dimso, output->dimSize, sizeof(int) * on);
/* view the output as a 2D tensor: (unitNum / last dimension, last dimension) */
output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
/* 1D X_FLOAT buffer tensor with one entry per unit of the padding */
XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, padding->devID);
_CopyValues(padding, padding2);
_MultiplyDim(output, padding2, output, 0);
/* NOTE(review): presumably scale-then-shift, i.e. padding 1 -> 0 and 0 -> -1e9,
   so that padded positions receive a large negative value - confirm */
_ScaleAndShiftMe(padding2, 1e9F, -1e9F);
_SumDim(output, padding2, output, 0);
/* restore the original shape of the output */
output->Reshape(on, dimso);
if(gold != NULL){
/* NOTE(review): gold is reshaped and restored with the dimensions recorded
   from output, so it is assumed to have the same shape as output - confirm */
gold->Reshape(gold->unitNum/dimso[gold->order - 1], dimso[gold->order - 1]);
/* padding2 was rescaled above, so reload the raw padding values */
_CopyValues(padding, padding2);
_MultiplyDim(gold, padding2, gold, 0);
gold->Reshape(on, dimso);
}
delete[] dimso;
DelTensorBuf(padding2);
}
/*
rescale the output and gold tensors for normalized loss
>> output - output tensor of the network (must be of order 3)
>> gold - gold standard (must be of order 3)
>> padding - padding of a batch of sentences
*/
void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding)
{
    CheckNTErrors(output->order == 3, "Wrong dimension number!");
    CheckNTErrors(gold->order == 3, "Wrong dimension number!");

    /* the normalization factor is one over the sum of all padding entries */
    DTYPE paddingSum = _ReduceSumAll(padding);
    DTYPE factor = 1/paddingSum;

    /* the output is exponentiated, scaled and mapped back with a logarithm */
    _ExpMe(output);
    _ScaleAndShiftMe(output, factor);
    _LogMe(output);

    /* the gold standard is scaled by the same factor */
    _ScaleAndShiftMe(gold, factor);
}
/*
perform label smoothing
>> gold - gold standard
>> smoothed - result of label smoothing (initialized from gold and overwritten)
>> p - smoothing factor, must lie in [0,1]; p = 0 yields a plain copy of gold
*/
void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
{
    CheckNTErrors(p >= 0 && p <= 1.0F, "Smoothing factor must be in range [0,1]");

    /* size of the last dimension of the gold standard */
    int lastDimSize = gold->GetDim(-1);

    /* start from a copy of the gold standard */
    InitTensor(smoothed, gold);
    _CopyValues(gold, smoothed);

    /* for a non-zero factor, scale the copy by (1 - p) and shift it by p / n */
    if(p != 0){
        DTYPE keepWeight = 1.0F - p;
        DTYPE sharedMass = p / lastDimSize;
        _ScaleAndShiftMe(smoothed, keepWeight, sharedMass);
    }
}
}
......@@ -125,28 +125,16 @@ public:
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
void Validate(const char * fn, const char * ofn, T2TModel * model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* rescale the output and gold tensors for normalized loss */
void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* perform label smoothing */
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
};
......
......@@ -94,7 +94,7 @@ int TransformerMain(int argc, const char ** argv)
else{
T2TTrainer tester;
tester.Init(argc, args);
tester.Test(testFN, outputFN, &model);
tester.Validate(testFN, outputFN, &model);
}
}
......
......@@ -28,7 +28,6 @@
#ifndef __XTENSOR_H__
#define __XTENSOR_H__
#include <math.h>
#include "XGlobal.h"
#include "XMem.h"
#include "XPRunner.h"
......@@ -416,11 +415,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
void Dump(FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a binary file */
void BinaryDump(FILE * file);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论