Commit e1ed713a by xuchen

optimize the t2t code

parent bdf5c952
...@@ -34,7 +34,12 @@ namespace nts{ ...@@ -34,7 +34,12 @@ namespace nts{
/* compute dE/dx of a node */ /* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient) void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
{ {
CheckNTErrors(node->grad != NULL, "No gradient found!"); if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income; XLink &income = node->income;
int operID = income.typeID; int operID = income.typeID;
......
...@@ -131,32 +131,20 @@ XTensor T2TEmbedder::Make(XTensor &input) ...@@ -131,32 +131,20 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding; XTensor wordEmbedding;
XTensor posEmbedding; XTensor posEmbedding;
bool match = (posEmbedding.order == input.order); /* make positional embeddings */
if(match){ XTensor position;
for(int i = 0; i < input.order; i++){ XTensor embTMP;
if(dims[i] != posEmbedding.GetDim(i))
match = false;
}
}
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, devID); InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
position.Range(0, position.unitNum, 1);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0); embTMP = Gather(posEmbeddingBase, position);
_Unsqueeze(posTMP, &posEmbedding, 0, dims[0]); posEmbedding = Unsqueeze(embTMP, 0, dims[0]);
DelTensorBuf(posTMP);
}
/* then we make word embeddings */ /* make word embeddings */
wordEmbedding = Gather(w, input); wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize)); wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */ /* sum over the two embeddings */
return wordEmbedding + posEmbedding; return wordEmbedding + posEmbedding;
} }
......
...@@ -114,64 +114,28 @@ make the network for language modeling (with the output softmax layer) ...@@ -114,64 +114,28 @@ make the network for language modeling (with the output softmax layer)
*/ */
void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining) void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
{ {
XTensor encoding; int len = padding.GetDim(padding.order - 1);
int * dims = new int[padding.order + 2];
/* generate mask to see "previous" words only */ for(int i = 0; i < padding.order; i++)
//int len = input.GetDim(input.order - 2); dims[i + 1] = padding.GetDim(i);
//int * dims = new int[input.order + 1];
//for(int i = 0; i < input.order; i++)
// dims[i + 1] = input.GetDim(i);
//dims[0] = nhead;
//dims[input.order] = len;
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead; dims[0] = nhead;
dims[input.order + 1] = len; dims[padding.order + 1] = len;
XTensor mask; XTensor mask;
InitTensor(&mask, input.order + 2, dims, X_FLOAT, padding.devID); InitTensor(&mask, padding.order + 2, dims, X_FLOAT, padding.devID);
delete[] dims;
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9. /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent the attention to current or following words in this matrix can be used to prevent the attention to current or following words in
a given sequence. */ a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0); _SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F); ScaleAndShiftMe(mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.devID);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType, /* forward */
// padding.devID); XTensor encoding;
//
///* mask of the padding */
//_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
//_Unsqueeze(padding2, padding3, 0, nhead);
//
//_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//
////_Sum(&mask, padding3, &mask);
encoding = MakeEncoder(input, mask, isTraining); encoding = MakeEncoder(input, mask, isTraining);
outputLayer->Make(encoding, output); outputLayer->Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
//DelTensorBuf(padding3);
DelTensorBuf(padding2);
} }
/* /*
...@@ -183,7 +147,9 @@ make the network for machine translation (with the output softmax layer) ...@@ -183,7 +147,9 @@ make the network for machine translation (with the output softmax layer)
>> paddingDec - padding of the sequences (on the decoder side) >> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training >> isTraining - indicates whether the model is for training
*/ */
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining) void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec,
bool isTraining)
{ {
XTensor encoding; XTensor encoding;
XTensor decoding; XTensor decoding;
...@@ -192,10 +158,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe ...@@ -192,10 +158,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor maskEncDec; XTensor maskEncDec;
/* encoder mask */ /* encoder mask */
MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc); MakeMTMaskEnc(paddingEnc, maskEnc);
/* decoder mask */ /* decoder mask */
MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec); MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining); encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
...@@ -289,40 +255,21 @@ make the mask of the encoder ...@@ -289,40 +255,21 @@ make the mask of the encoder
>> paddingEnc - padding of the encoder input >> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention >> maskEnc - mask of the encoder self-attention
*/ */
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc) void T2TModel::MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc)
{ {
/* padding on the source side */ XTensor padding2;
int * dimsPadding = new int[paddingEnc.order + 2]; XTensor padding3;
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
/* mask of the padding */ /* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1)); Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead); Unsqueeze(padding2, padding3, 0, nhead);
ScaleAndShiftMe(padding3, 1e9F, -1e9F);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3); InitTensor(&maskEnc, &padding3);
maskEnc.SetZeroAll(); maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */ /* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc); SumMe(maskEnc, padding3);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
delete[] dimsPadding;
} }
/* /*
...@@ -334,54 +281,33 @@ make the mask of the decoder ...@@ -334,54 +281,33 @@ make the mask of the decoder
>> maskDec - mask of the decoder self-attention >> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention >> maskEncDec - mask of the decoder enc-dec attention
*/ */
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec, void T2TModel::MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec) XTensor &maskDec, XTensor &maskEncDec)
{ {
int len = inputDec.GetDim(inputDec.order - 1); int len = paddingDec.GetDim(paddingDec.order - 1);
int * dims = new int[inputDec.order + 2]; int * dims = new int[paddingDec.order + 2];
for(int i = 0; i < inputDec.order; i++) for(int i = 0; i < paddingDec.order; i++)
dims[i + 1] = inputDec.GetDim(i); dims[i + 1] = paddingDec.GetDim(i);
dims[0] = nhead; dims[0] = nhead;
dims[inputDec.order + 1] = len; dims[paddingDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID); InitTensor(&maskDec, paddingDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* An upper triangular matrix where the cells of the upper triangle are set to -1e9. /* An upper triangular matrix where the cells of the upper triangle are set to -1e9.
This matrix can be used to block the attention to current or following words in This matrix can be used to block the attention to current or following words in
a given sequence. */ a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0); _SetDataLowTri(&maskDec, 1e9F, 0);
ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
//maskDec.Dump(stderr, "mask: ");
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
//maskDec.Dump(stderr, "mask: ");
/* encoder-decoder mask that prevents the attention to padding dummy words */ /* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1); XTensor maskEncDecTMP;
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
paddingEnc.devID); ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID); Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//paddingEnc.Dump(stderr, "paddingenc:");
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
delete[] dims; delete[] dims;
} }
/* /*
get parameter matrices get parameter matrices
>> list - the list that keeps the parameter matrices >> list - the list that keeps the parameter matrices
......
...@@ -87,11 +87,10 @@ public: ...@@ -87,11 +87,10 @@ public:
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec); XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
/* make the mask of the encoder */ /* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc); void MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc);
/* make the mask of the decoder */ /* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec, void MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec); XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */ /* get parameter matrices */
......
...@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, ...@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[inputEnc->order - 1] = 1; dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID); InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol); first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */ /* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) { if (inputLast == NULL) {
...@@ -195,13 +195,13 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, ...@@ -195,13 +195,13 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor paddingDec; XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID); InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1); paddingDec.SetDataFixed(1);
XTensor maskDec; XTensor maskDec;
XTensor maskEncDec; XTensor maskEncDec;
/* decoder mask */ /* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec); m->MakeMTMaskDec(*paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */ /* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false); decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
......
...@@ -89,7 +89,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe ...@@ -89,7 +89,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
Prepare(input->unitNum/input->GetDim(-1), beamSize); Prepare(input->unitNum/input->GetDim(-1), beamSize);
/* encoder mask */ /* encoder mask */
model->MakeMTMaskEnc(*input, *padding, maskEnc); model->MakeMTMaskEnc(*padding, maskEnc);
//input->Dump(stderr, "input:"); //input->Dump(stderr, "input:");
//maskEnc.Dump(stderr, "maskenc:"); //maskEnc.Dump(stderr, "maskenc:");
...@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output) ...@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output)
int * words = new int[maxLength]; int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT); InitTensor(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1); output->SetDataFixed(-1);
/* heap for an input sentence in the batch */ /* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){ for(int h = 0; h < batchSize; h++){
......
...@@ -125,28 +125,16 @@ public: ...@@ -125,28 +125,16 @@ public:
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model); void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */ /* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model); void Validate(const char * fn, const char * ofn, T2TModel * model);
/* make a checkpoint */ /* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id); void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */ /* update the model by delta rule */
void Update(T2TModel * model, const float lr); void Update(T2TModel * model, const float lr);
/* prepare model for training */ /* prepare model for training */
void PrepareModel(T2TModel * model); void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* rescale the output and gold tensors for normalized loss */
void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* perform label smoothing */
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
}; };
......
...@@ -94,7 +94,7 @@ int TransformerMain(int argc, const char ** argv) ...@@ -94,7 +94,7 @@ int TransformerMain(int argc, const char ** argv)
else{ else{
T2TTrainer tester; T2TTrainer tester;
tester.Init(argc, args); tester.Init(argc, args);
tester.Test(testFN, outputFN, &model); tester.Validate(testFN, outputFN, &model);
} }
} }
......
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
#ifndef __XTENSOR_H__ #ifndef __XTENSOR_H__
#define __XTENSOR_H__ #define __XTENSOR_H__
#include <math.h>
#include "XGlobal.h" #include "XGlobal.h"
#include "XMem.h" #include "XMem.h"
#include "XPRunner.h" #include "XPRunner.h"
...@@ -416,11 +415,11 @@ public: ...@@ -416,11 +415,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const; bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */ /* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0); void Dump(FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */ /* dump data to a file */
static static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0); void Dump(const XTensor * tensor, FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a binary file */ /* dump data to a binary file */
void BinaryDump(FILE * file); void BinaryDump(FILE * file);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论