Commit 55dfe49b by xiaotong

model score with length penalty

parent 1f041016
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: lp = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> lp - length penalty of the sequence (for each entry)
>> alpha - the parameter that controls the length preference
*/
void T2TLengthPenalizer::GNMT(const XTensor & length, XTensor & lp, float alpha)
{
XTensor base;
/* base = (5 + n)/(5 + 1), matching the formula above */
base = ScaleAndShift(ScaleAndShift(length, 1.0F, 5.0F), 1.0F/(5 + 1));
lp = Power(base, alpha);
}
}
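A quick way to sanity-check the formula is to evaluate it on a few scalar lengths. The snippet below is a standalone sketch (not part of this commit) that prints lp = ((5 + n)/(5 + 1))^alpha for several values of n, using the same default alpha that T2TSearch loads later:

#include <cmath>
#include <cstdio>

int main()
{
    const float alpha = 0.2F;
    for (int n = 1; n <= 21; n += 5) {
        /* the GNMT-like penalty grows slowly with the sequence length */
        float lp = std::pow((5.0F + n) / (5.0F + 1.0F), alpha);
        std::printf("n = %2d  lp = %.4f\n", n, lp);
    }
    return 0;
}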
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing documents is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We penalize short sequences because a product of probability-like terms
gives them higher scores, and thus they have more chances to beat longer
hypotheses in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: lp = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
void GNMT(const XTensor & length, XTensor & lp, float alpha);
};
}
#endif
\ No newline at end of file
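To see why the normalization matters, here is a scalar illustration (hypothetical numbers, and an alpha larger than the 0.2 default purely to make the effect visible): an unnormalized sum of log-probabilities always favors the shorter hypothesis, while dividing by the GNMT penalty lets a longer hypothesis with better per-token probabilities win:

#include <cmath>
#include <cstdio>

int main()
{
    const float alpha = 1.0F;                   /* for illustration only   */
    float shortLogProb = 3  * std::log(0.5F);   /* 3 tokens, p = 0.5 each  */
    float longLogProb  = 10 * std::log(0.7F);   /* 10 tokens, p = 0.7 each */
    float lpShort = std::pow((5.0F + 3)  / 6.0F, alpha);
    float lpLong  = std::pow((5.0F + 10) / 6.0F, alpha);
    std::printf("raw:        short = %.3f  long = %.3f\n", shortLogProb, longLogProb);
    std::printf("normalized: short = %.3f  long = %.3f\n",
                shortLogProb / lpShort, longLogProb / lpLong);
    return 0;
}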
@@ -53,8 +53,14 @@ void T2TStateBundle::MakeStates(int num)
states = new T2TState[num];
for(int i = 0; i < num; i++)
for(int i = 0; i < num; i++){
states[i].prediction = -1;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
}
/* constructor */
@@ -124,7 +130,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, XTensor *
inputDec = Concatenate(*inputLast, s->prediction, inputLast->GetDim(-1));
/* prediction probabilities */
XTensor &output = next->score;
XTensor &output = next->prob;
XTensor paddingDec;
InitTensor3D(&paddingDec, inputDec.GetDim(0), inputDec.GetDim(1), m->outputLayer->vSize, X_INT);
......
@@ -24,6 +24,7 @@
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
@@ -36,11 +37,17 @@ public:
/* we assume that the prediction is an integer */
int prediction;
/* score of the prediction */
float score;
/* probability of every prediction (last state of the path) */
float prob;
/* score of the path */
float scorePath;
/* probability of every path */
float probPath;
/* model score of every path */
float modelScore;
/* number of steps we have taken so far */
int nstep;
/* pointer to the previous state */
T2TState * last;
@@ -56,11 +63,17 @@ public:
/* id of the previous state that generates the current one */
XTensor preID;
/* score of every prediction (last state of the path) */
XTensor score;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* score of every path */
XTensor scorePath;
/* step number of each hypothesis */
XTensor nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
......
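The new fields chain hypotheses together: each T2TState carries the log-probability of its last prediction (prob), the accumulated log-probability of the path (probPath), the length-normalized model score (modelScore), the step count (nstep), and a pointer to the previous state (last); T2TStateBundle holds the same quantities as tensors over the whole beam. A scalar sketch of how one state might extend another (the field names mirror the struct above, the helper itself is only an assumption for illustration):

#include <cmath>

struct MiniState {
    float prob;              /* log-prob of the last prediction       */
    float probPath;          /* log-prob accumulated along the path   */
    float modelScore;        /* probPath divided by the GNMT penalty  */
    int nstep;               /* number of steps taken so far          */
    MiniState * last;        /* previous state on the path            */
};

MiniState Extend(MiniState * prev, float logProb, float alpha)
{
    MiniState s;
    s.last       = prev;
    s.prob       = logProb;
    s.probPath   = prev->probPath + logProb;
    s.nstep      = prev->nstep + 1;
    float lp     = std::pow((5.0F + s.nstep) / 6.0F, alpha);
    s.modelScore = s.probPath / lp;
    return s;
}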
@@ -36,6 +36,7 @@ initialize the model
void T2TSearch::InitModel(int argc, char ** argv)
{
LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
LoadParamFloat(argc, argv, "lenalpha", &alpha, 0.2F);
}
/*
@@ -75,6 +76,9 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
/* predict the next state */
predictor.Predict(next, &encoding, input, padding);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(next);
}
@@ -83,6 +87,37 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
{
XTensor &score = beam->modelScore;
XTensor &prob = beam->prob;
XTensor &probPath = beam->probPath;
XTensor &lenPrev = prev->nstep;
XTensor &len = beam->nstep;
XTensor lp;
InitTensor(&score, &prob);
/* the log-scale probability of the entire sequence */
_Sum(&prob, &probPath, &score);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
/* the GNMT-like length penalty */
T2TLengthPenalizer::GNMT(len, lp, alpha);
/* score = log-prob/lp */
_Div(&score, &lp, &score);
}
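A scalar view of the same pipeline may help when reading the tensor calls above; each line of the sketch names the operator it stands in for (the function itself is only an illustration, not library code):

#include <cmath>

float ScoreOneHypothesis(float prob, float probPath, float lenPrev, float alpha)
{
    float score = prob + probPath;                      /* _Sum: log-prob of the whole sequence */
    float len   = lenPrev * 1.0F + 1.0F;                /* _ScaleAndShift: one more step taken  */
    float lp    = std::pow((5.0F + len) / 6.0F, alpha); /* T2TLengthPenalizer::GNMT             */
    return score / lp;                                  /* _Div: score = log-prob / lp          */
}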
/*
generate tokens for the next state via beam pruning
>> beam - the beam that keeps a number of states
*/
@@ -93,7 +128,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor &score = beam->score;
XTensor &score = beam->modelScore;
XTensor &index = beam->prediction;
XTensor &preID = beam->preID;
int order = score.order;
@@ -131,16 +166,20 @@ void T2TSearch::Generate(T2TStateBundle * beam)
/* "preID" represents the id (or the offset) of previous state used to make the current
hypothesis. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam * vocab-size. We,
therefore, divide the top-k index by vocab-size to compute the id of previous state for
each hypothesis in the top-k list. */
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
Descale(preID, sizeVocab);
/* Then we are going to do something similar to "preID". For the top-k predictions, we
need to know their indices in the vocabulary. We compute the offset of each prediction
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it by vocab-size and taking the remainder. */
Mod(index, sizeVocab);
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensor(&score, &scoreTopK);
CopyValues(scoreTopK, score);
}
/*
......
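The division-and-remainder trick described in the comments above can be checked with a few toy indices. The sketch below uses made-up numbers (a vocabulary of 6, three pretend top-k hits) and plain integer arithmetic in place of Descale and Mod:

#include <cstdio>

int main()
{
    const int sizeVocab = 6;              /* toy vocabulary size             */
    int topKIndex[3] = { 4, 9, 14 };      /* pretend top-3 flattened indices */

    for (int i = 0; i < 3; i++) {
        int preID      = topKIndex[i] / sizeVocab;   /* like Descale(preID, sizeVocab) */
        int prediction = topKIndex[i] % sizeVocab;   /* like Mod(index, sizeVocab)     */
        std::printf("flat index %2d -> previous state %d, token %d\n",
                    topKIndex[i], preID, prediction);
    }
    return 0;
}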
@@ -35,6 +35,9 @@ namespace transformer
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
@@ -57,6 +60,9 @@ public:
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
......