diff --git a/source/sample/transformer/T2TLengthPenalty.cpp b/source/sample/transformer/T2TLengthPenalty.cpp new file mode 100644 index 0000000..0959b21 --- /dev/null +++ b/source/sample/transformer/T2TLengthPenalty.cpp @@ -0,0 +1,41 @@ +/* NiuTrans.Tensor - an open-source tensor library + * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../tensor/core/CHeader.h" +#include "T2TLengthPenalty.h" + +using namespace nts; + +namespace transformer +{ + +/* +GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha +where n = length of the sequence +>> length - length of the sequence (for each entry) +>> lp - length penalty of the sequence (for each entry) +*/ +void T2TLengthPenalizer::GNMT(const XTensor & length, XTensor & lp, float alpha) +{ + XTensor base; + + base = ScaleAndShift(ScaleAndShift(length, 1.0F, 5.0F), 1.0F/(5 + 1)); + + lp = Power(base, alpha); +} + +} diff --git a/source/sample/transformer/T2TLengthPenalty.h b/source/sample/transformer/T2TLengthPenalty.h new file mode 100644 index 0000000..12c5f61 --- /dev/null +++ b/source/sample/transformer/T2TLengthPenalty.h @@ -0,0 +1,48 @@ +/* NiuTrans.Tensor - an open-source tensor library + * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. + * All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08 + * Start of a new week - I just finished several documents. + * Writing document is harder than writing code :) + */ + +#ifndef __T2TLENGTHPENALTY_H__ +#define __T2TLENGTHPENALTY_H__ + +#include "../../tensor/XTensor.h" + +using namespace nts; + +namespace transformer +{ + +/* We intend to penalize short sequences because they have higher score + in product of a sequence of probability-like terms and have more chances + to beat others in search. 
*/ +class T2TLengthPenalizer +{ +public: + /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha + where n = length of the sequence */ + static + void GNMT(const XTensor & length, XTensor & lp, float alpha); +}; + +} + +#endif \ No newline at end of file diff --git a/source/sample/transformer/T2TPredictor.cpp b/source/sample/transformer/T2TPredictor.cpp index bf45504..acd0b67 100644 --- a/source/sample/transformer/T2TPredictor.cpp +++ b/source/sample/transformer/T2TPredictor.cpp @@ -53,8 +53,14 @@ void T2TStateBundle::MakeStates(int num) states = new T2TState[num]; - for(int i = 0; i < num; i++) + for(int i = 0; i < num; i++){ + states[i].prediction = -1; + states[i].prob = 0; + states[i].probPath = 0; + states[i].modelScore = 0; + states[i].nstep = 0; states[i].last = NULL; + } } /* constructor */ @@ -124,7 +130,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputDec = Concatenate(*inputLast, s->prediction, inputLast->GetDim(-1)); /* prediction probabilities */ - XTensor &output = next->score; + XTensor &output = next->prob; XTensor paddingDec; InitTensor3D(&paddingDec, inputDec.GetDim(0), inputDec.GetDim(1), m->outputLayer->vSize, X_INT); diff --git a/source/sample/transformer/T2TPredictor.h b/source/sample/transformer/T2TPredictor.h index c401b59..2891525 100644 --- a/source/sample/transformer/T2TPredictor.h +++ b/source/sample/transformer/T2TPredictor.h @@ -24,6 +24,7 @@ #define __T2TPREDICTOR_H__ #include "T2TModel.h" +#include "T2TLengthPenalty.h" namespace transformer { @@ -36,11 +37,17 @@ public: /* we assume that the prediction is an integer */ int prediction; - /* score of the prediction */ - float score; + /* probability of every prediction (last state of the path) */ + float prob; - /* score of the path */ - float scorePath; + /* probability of every path */ + float probPath; + + /* model score of every path */ + float modelScore; + + /* number of steps we go over so far */ + int nstep; /* pointer to the 
previous state */ T2TState * last; @@ -56,11 +63,17 @@ public: /* id of the previous state that generates the current one */ XTensor preID; - /* score of every prediction (last state of the path) */ - XTensor score; + /* probability of every prediction (last state of the path) */ + XTensor prob; + + /* probability of every path */ + XTensor probPath; + + /* model score of every path */ + XTensor modelScore; - /* score of every path */ - XTensor scorePath; + /* step number of each hypothesis */ + XTensor nstep; /* layers on the encoder side. We actually use the encoder output instead of all hidden layers. */ diff --git a/source/sample/transformer/T2TSearch.cpp b/source/sample/transformer/T2TSearch.cpp index db3b493..89f2f57 100644 --- a/source/sample/transformer/T2TSearch.cpp +++ b/source/sample/transformer/T2TSearch.cpp @@ -36,6 +36,7 @@ initialize the model void T2TSearch::InitModel(int argc, char ** argv) { LoadParamInt(argc, argv, "beamsize", &beamSize, 1); + LoadParamFloat(argc, argv, "lenalpha", &alpha, 0.2F); } /* @@ -75,6 +76,9 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe /* predict the next state */ predictor.Predict(next, &encoding, input, padding); + /* compute the model score (given the prediction probability) */ + Score(cur, next); + /* beam pruning */ Generate(next); } @@ -83,6 +87,37 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe } /* +compute the model score for each hypothesis +>> prev - the beam of the previous state +>> beam - the beam that keeps a number of states +*/ +void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam) +{ + XTensor &score = beam->modelScore; + XTensor &prob = beam->prob; + XTensor &probPath = beam->probPath; + XTensor &lenPrev = prev->nstep; + XTensor &len = beam->nstep; + XTensor lp; + + InitTensor(&score, &prob); + + /* the log-scale probability of the entire sequence */ + _Sum(&prob, &probPath, &score); + + InitTensor(&len, &lenPrev); 
+ InitTensor(&lp, &lenPrev); + + _ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F); + + /* the GNMT-like length penalty */ + T2TLengthPenalizer::GNMT(len, lp, alpha); + + /* score = log-prob/lp */ + _Div(&score, &lp, &score); +} + +/* generate tokens for the next state via beam pruning >> beam - the beam that keeps a number of states */ @@ -93,7 +128,7 @@ void T2TSearch::Generate(T2TStateBundle * beam) int dimsTopK[MAX_TENSOR_DIM_NUM]; XTensor scoreTopK; - XTensor &score = beam->score; + XTensor &score = beam->modelScore; XTensor &index = beam->prediction; XTensor &preID = beam->preID; int order = score.order; @@ -131,16 +166,20 @@ void T2TSearch::Generate(T2TStateBundle * beam) /* "preID" represents the id (or the offset) of previous state used to make the current hypothesis. Note that we reshape the "score" tensor into a matrix where each row means a previous state. The column number is size-of-beam * vocab-size. We, - therefore, divide the top-k index by vocab-size to compute the id of previous state for - each hypothesis in the top-k list. */ + therefore, divide entries of the top-k index by vocab-size to compute the id of + previous state for each hypothesis in the top-k list. */ Descale(preID, sizeVocab); - /* Then we are going to do something similar to "preID". For the top-k predictions, we - need to know their indices in the vocabulary. We compute the offset of each prediction + /* Then, we do something similar to "preID". For the top-k predictions, we need + to know their indices in the vocabulary. We compute the offset of each prediction in the vocabulary by dividing it with vocab-size and computing the remainder. 
*/ Mod(index, sizeVocab); score.Reshape(order, dims); + + /* we keep the top-k scores */ + InitTensor(&score, &scoreTopK); + CopyValues(scoreTopK, score); } /* diff --git a/source/sample/transformer/T2TSearch.h b/source/sample/transformer/T2TSearch.h index 039ab0e..8d81560 100644 --- a/source/sample/transformer/T2TSearch.h +++ b/source/sample/transformer/T2TSearch.h @@ -35,6 +35,9 @@ namespace transformer class T2TSearch { private: + /* the alpha parameter controls the length preference */ + float alpha; + /* predictor */ T2TPredictor predictor; @@ -57,6 +60,9 @@ public: /* search for the most promising states */ void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output); + /* compute the model score for each hypothesis */ + void Score(T2TStateBundle * prev, T2TStateBundle * beam); + /* generate token indices via beam pruning */ void Generate(T2TStateBundle * beam);