Commit 55dfe49b by xiaotong

model score with length penalty

parent 1f041016
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: lp = ((5 + n)/(5 + 1))^\alpha
where n is the length of the sequence
>> length - length of the sequence (for each entry)
>> lp - length penalty of the sequence (for each entry)
>> alpha - the parameter that controls the length preference
*/
void T2TLengthPenalizer::GNMT(const XTensor & length, XTensor & lp, float alpha)
{
XTensor base;
/* base = (5 + n)/(5 + 1) */
base = ScaleAndShift(ScaleAndShift(length, 1.0F, 5.0F), 1.0F/(5 + 1));
lp = Power(base, alpha);
}
}
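As a quick sanity check of the formula, the following standalone sketch evaluates the same expression with plain scalars (std::pow instead of the XTensor API; the lengths are made up for illustration):

#include <cmath>
#include <cstdio>

/* scalar version of the GNMT-like penalty: lp = ((5 + n)/(5 + 1))^alpha */
float GNMTPenalty(float n, float alpha)
{
    return std::pow((5.0F + n) / (5.0F + 1.0F), alpha);
}

int main()
{
    const float lengths[] = {1.0F, 10.0F, 30.0F};

    /* with alpha = 0.2 (the default read in T2TSearch::InitModel below),
       lp grows slowly with length: n = 1 -> 1.00, n = 10 -> ~1.20, n = 30 -> ~1.42 */
    for (float n : lengths)
        printf("n = %4.0f  lp = %.3f\n", n, GNMTPenalty(n, 0.2F));

    return 0;
}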
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing documents is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences: a product of probability-like terms
involves fewer factors for them, so they tend to receive higher scores and
have more chances to beat longer hypotheses in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: lp = ((5 + n)/(5 + 1))^\alpha
where n is the length of the sequence */
static
void GNMT(const XTensor & length, XTensor & lp, float alpha);
};
}
#endif
\ No newline at end of file
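To make the comment above concrete, here is a hedged scalar sketch with hypothetical log-probabilities: the short hypothesis sums fewer negative terms and wins on the raw score, but loses once both scores are divided by their length penalties.

#include <cmath>
#include <cstdio>

int main()
{
    /* hypothetical summed log-probabilities: the 4-token hypothesis adds up
       fewer negative terms than the 8-token one, so it looks better raw */
    float logProbShort = -4.0F, lenShort = 4.0F;
    float logProbLong  = -5.0F, lenLong  = 8.0F;
    float alpha = 1.0F;  /* a strong penalty, chosen here only for contrast */

    float lpShort = std::pow((5.0F + lenShort) / 6.0F, alpha);  /* 1.50  */
    float lpLong  = std::pow((5.0F + lenLong)  / 6.0F, alpha);  /* ~2.17 */

    /* raw scores: short wins (-4.0 > -5.0);
       after dividing by lp: long wins (-5.0/2.17 = -2.31 > -4.0/1.50 = -2.67) */
    printf("short: raw %.2f  penalized %.2f\n", logProbShort, logProbShort / lpShort);
    printf("long : raw %.2f  penalized %.2f\n", logProbLong,  logProbLong  / lpLong);

    return 0;
}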
@@ -53,8 +53,14 @@ void T2TStateBundle::MakeStates(int num)
     states = new T2TState[num];
 
-    for(int i = 0; i < num; i++)
+    for(int i = 0; i < num; i++){
+        states[i].prediction = -1;
+        states[i].prob = 0;
+        states[i].probPath = 0;
+        states[i].modelScore = 0;
+        states[i].nstep = 0;
         states[i].last = NULL;
+    }
 }
 
 /* constructor */
@@ -124,7 +130,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, XTensor *
     inputDec = Concatenate(*inputLast, s->prediction, inputLast->GetDim(-1));
 
     /* prediction probabilities */
-    XTensor &output = next->score;
+    XTensor &output = next->prob;
     XTensor paddingDec;
 
     InitTensor3D(&paddingDec, inputDec.GetDim(0), inputDec.GetDim(1), m->outputLayer->vSize, X_INT);
...
@@ -24,6 +24,7 @@
 #define __T2TPREDICTOR_H__
 
 #include "T2TModel.h"
+#include "T2TLengthPenalty.h"
 
 namespace transformer
 {
@@ -36,11 +37,17 @@ public:
     /* we assume that the prediction is an integer */
     int prediction;
 
-    /* score of the prediction */
-    float score;
+    /* probability of every prediction (last state of the path) */
+    float prob;
 
-    /* score of the path */
-    float scorePath;
+    /* probability of every path */
+    float probPath;
+
+    /* model score of every path */
+    float modelScore;
+
+    /* number of steps we have gone over so far */
+    int nstep;
 
     /* pointer to the previous state */
     T2TState * last;
@@ -56,11 +63,17 @@ public:
     /* id of the previous state that generates the current one */
     XTensor preID;
 
-    /* score of every prediction (last state of the path) */
-    XTensor score;
+    /* probability of every prediction (last state of the path) */
+    XTensor prob;
+
+    /* probability of every path */
+    XTensor probPath;
 
-    /* score of every path */
-    XTensor scorePath;
+    /* model score of every path */
+    XTensor modelScore;
+
+    /* step number of each hypothesis */
+    XTensor nstep;
 
     /* layers on the encoder side. We actually use the encoder output instead
        of all hidden layers. */
...
@@ -36,6 +36,7 @@ initialize the model
 void T2TSearch::InitModel(int argc, char ** argv)
 {
     LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
+    LoadParamFloat(argc, argv, "lenalpha", &alpha, 0.2F);
 }
 
 /*
@@ -75,6 +76,9 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
         /* predict the next state */
         predictor.Predict(next, &encoding, input, padding);
 
+        /* compute the model score (given the prediction probability) */
+        Score(cur, next);
+
         /* beam pruning */
         Generate(next);
     }
@@ -83,6 +87,37 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
 }
 
 /*
+compute the model score for each hypothesis
+>> prev - the beam of the previous state
+>> beam - the beam that keeps a number of states
+*/
+void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
+{
+    XTensor &score = beam->modelScore;
+    XTensor &prob = beam->prob;
+    XTensor &probPath = beam->probPath;
+    XTensor &lenPrev = prev->nstep;
+    XTensor &len = beam->nstep;
+    XTensor lp;
+
+    InitTensor(&score, &prob);
+
+    /* the log-scale probability of the entire sequence */
+    _Sum(&prob, &probPath, &score);
+
+    InitTensor(&len, &lenPrev);
+    InitTensor(&lp, &lenPrev);
+
+    _ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
+
+    /* the GNMT-like length penalty */
+    T2TLengthPenalizer::GNMT(len, lp, alpha);
+
+    /* score = log-prob / lp */
+    _Div(&score, &lp, &score);
+}
+
+/*
 generate tokens for the next state via beam pruning
 >> beam - the beam that keeps a number of states
 */
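For a single hypothesis, the tensor operations in Score reduce to a few scalar steps. The following minimal sketch mirrors that arithmetic with hypothetical values (plain floats in place of XTensor):

#include <cmath>
#include <cstdio>

int main()
{
    /* hypothetical state carried over from the previous step */
    float probPath  = -3.2F;  /* log-prob of the path so far  */
    float nstepPrev = 4.0F;   /* number of steps taken so far */
    float alpha     = 0.2F;   /* the "lenalpha" parameter     */

    /* log-prob of this step's prediction */
    float prob = -0.7F;

    /* log-scale probability of the entire sequence (the _Sum call) */
    float score = prob + probPath;

    /* one more step (the _ScaleAndShift call: scale 1, shift 1) */
    float nstep = nstepPrev + 1.0F;

    /* GNMT-like length penalty and the final model score (the _Div call) */
    float lp = std::pow((5.0F + nstep) / 6.0F, alpha);
    score = score / lp;

    printf("log-prob = %.2f  lp = %.3f  model score = %.3f\n",
           prob + probPath, lp, score);
    return 0;
}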
@@ -93,7 +128,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
     int dimsTopK[MAX_TENSOR_DIM_NUM];
 
     XTensor scoreTopK;
-    XTensor &score = beam->score;
+    XTensor &score = beam->modelScore;
     XTensor &index = beam->prediction;
     XTensor &preID = beam->preID;
     int order = score.order;
@@ -131,16 +166,20 @@ void T2TSearch::Generate(T2TStateBundle * beam)
     /* "preID" represents the id (or the offset) of previous state used to make the current
        hypothesis. Note that we reshape the "score" tensor into a matrix where each
        row means a previous state. The column number is size-of-beam * vocab-size. We,
-       therefore, divide the top-k index by vocab-size to compute the id of previous state for
-       each hypothesis in the top-k list. */
+       therefore, divide entries of the top-k index by vocab-size to compute the id of
+       previous state for each hypothesis in the top-k list. */
     Descale(preID, sizeVocab);
 
-    /* Then we are going to do something similar to "preID". For the top-k predictions, we
-       need to know their indices in the vocabulary. We compute the offset of each prediction
-       in the vocabulary by dividing it with vocab-size and computing the remainder. */
+    /* Then, we do something similar to "preID". For the top-k predictions, we need
+       to know their indices in the vocabulary. We compute the offset of each prediction
+       in the vocabulary by dividing it with vocab-size and computing the remainder. */
     Mod(index, sizeVocab);
 
     score.Reshape(order, dims);
 
+    /* we keep the top-k scores */
+    InitTensor(&score, &scoreTopK);
+    CopyValues(scoreTopK, score);
 }
...
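The Descale/Mod pair above amounts to integer division and remainder on offsets into the flattened beam-by-vocabulary score matrix. A standalone sketch with made-up sizes:

#include <cstdio>

int main()
{
    /* the beam-by-vocab score matrix is flattened per row, so a top-k entry
       is an offset in [0, beamSize * vocabSize) */
    const int vocabSize = 10000;                /* hypothetical vocabulary size */
    const int topK[3]   = {30127, 2045, 49999}; /* hypothetical top-k offsets   */

    for (int i = 0; i < 3; i++) {
        int preID = topK[i] / vocabSize;        /* previous state id (Descale) */
        int word  = topK[i] % vocabSize;        /* word id in the vocab (Mod)  */
        printf("offset %5d -> previous state %d, word %d\n", topK[i], preID, word);
    }
    return 0;
}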
@@ -35,6 +35,9 @@ namespace transformer
 class T2TSearch
 {
 private:
+    /* the alpha parameter controls the length preference */
+    float alpha;
+
     /* predictor */
     T2TPredictor predictor;
@@ -57,6 +60,9 @@ public:
     /* search for the most promising states */
     void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
 
+    /* compute the model score for each hypothesis */
+    void Score(T2TStateBundle * prev, T2TStateBundle * beam);
+
     /* generate token indices via beam pruning */
     void Generate(T2TStateBundle * beam);
...