Commit 55dfe49b by xiaotong

model score with length penalty

parent 1f041016
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: lp = ((5 + n)/(5 + 1))^\alpha
where n is the length of the sequence
>> length - length of the sequence (for each entry)
>> lp - length penalty of the sequence (for each entry)
>> alpha - the parameter that controls the length preference
*/
void T2TLengthPenalizer::GNMT(const XTensor & length, XTensor & lp, float alpha)
{
XTensor base;
/* base = (5 + n)/(5 + 1) */
base = ScaleAndShift(ScaleAndShift(length, 1.0F, 5.0F), 1.0F/(5 + 1));
lp = Power(base, alpha);
}
}
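As a quick sanity check of the formula, the following standalone sketch evaluates the same expression with plain scalars (std::pow instead of the XTensor API; the lengths are made up for illustration):

#include <cmath>
#include <cstdio>

/* scalar version of the GNMT-like penalty: lp = ((5 + n)/(5 + 1))^alpha */
float GNMTPenalty(float n, float alpha)
{
    return std::pow((5.0F + n) / (5.0F + 1.0F), alpha);
}

int main()
{
    const float lengths[] = {1.0F, 10.0F, 30.0F};

    /* with alpha = 0.2 (the default read in T2TSearch::InitModel below),
       lp grows slowly with length: n = 1 -> 1.00, n = 10 -> ~1.20, n = 30 -> ~1.42 */
    for (float n : lengths)
        printf("n = %4.0f  lp = %.3f\n", n, GNMTPenalty(n, 0.2F));

    return 0;
}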
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing documents is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences: a product of probability-like terms
involves fewer factors for them, so they tend to receive higher scores and
have more chances to beat longer hypotheses in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: lp = ((5 + n)/(5 + 1))^\alpha
where n is the length of the sequence */
static
void GNMT(const XTensor & length, XTensor & lp, float alpha);
};
}
#endif
\ No newline at end of file
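To make the comment above concrete, here is a hedged scalar sketch with hypothetical log-probabilities: the short hypothesis sums fewer negative terms and wins on the raw score, but loses once both scores are divided by their length penalties.

#include <cmath>
#include <cstdio>

int main()
{
    /* hypothetical summed log-probabilities: the 4-token hypothesis adds up
       fewer negative terms than the 8-token one, so it looks better raw */
    float logProbShort = -4.0F, lenShort = 4.0F;
    float logProbLong  = -5.0F, lenLong  = 8.0F;
    float alpha = 1.0F;  /* a strong penalty, chosen here only for contrast */

    float lpShort = std::pow((5.0F + lenShort) / 6.0F, alpha);  /* 1.50  */
    float lpLong  = std::pow((5.0F + lenLong)  / 6.0F, alpha);  /* ~2.17 */

    /* raw scores: short wins (-4.0 > -5.0);
       after dividing by lp: long wins (-5.0/2.17 = -2.31 > -4.0/1.50 = -2.67) */
    printf("short: raw %.2f  penalized %.2f\n", logProbShort, logProbShort / lpShort);
    printf("long : raw %.2f  penalized %.2f\n", logProbLong,  logProbLong  / lpLong);

    return 0;
}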
@@ -53,8 +53,14 @@ void T2TStateBundle::MakeStates(int num)
     states = new T2TState[num];
 
-    for(int i = 0; i < num; i++)
+    for(int i = 0; i < num; i++){
+        states[i].prediction = -1;
+        states[i].prob = 0;
+        states[i].probPath = 0;
+        states[i].modelScore = 0;
+        states[i].nstep = 0;
         states[i].last = NULL;
+    }
 }
 
 /* constructor */
@@ -124,7 +130,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, XTensor *
     inputDec = Concatenate(*inputLast, s->prediction, inputLast->GetDim(-1));
 
     /* prediction probabilities */
-    XTensor &output = next->score;
+    XTensor &output = next->prob;
     XTensor paddingDec;
 
     InitTensor3D(&paddingDec, inputDec.GetDim(0), inputDec.GetDim(1), m->outputLayer->vSize, X_INT);
...
@@ -24,6 +24,7 @@
 #define __T2TPREDICTOR_H__
 
 #include "T2TModel.h"
+#include "T2TLengthPenalty.h"
 
 namespace transformer
 {
@@ -36,11 +37,17 @@ public:
     /* we assume that the prediction is an integer */
     int prediction;
 
-    /* score of the prediction */
-    float score;
+    /* probability of every prediction (last state of the path) */
+    float prob;
 
-    /* score of the path */
-    float scorePath;
+    /* probability of every path */
+    float probPath;
+
+    /* model score of every path */
+    float modelScore;
+
+    /* number of steps we have gone over so far */
+    int nstep;
 
     /* pointer to the previous state */
     T2TState * last;
@@ -56,11 +63,17 @@ public:
     /* id of the previous state that generates the current one */
     XTensor preID;
 
-    /* score of every prediction (last state of the path) */
-    XTensor score;
+    /* probability of every prediction (last state of the path) */
+    XTensor prob;
+
+    /* probability of every path */
+    XTensor probPath;
 
-    /* score of every path */
-    XTensor scorePath;
+    /* model score of every path */
+    XTensor modelScore;
+
+    /* step number of each hypothesis */
+    XTensor nstep;
 
     /* layers on the encoder side. We actually use the encoder output instead
        of all hidden layers. */
...
@@ -36,6 +36,7 @@ initialize the model
 void T2TSearch::InitModel(int argc, char ** argv)
 {
     LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
+    LoadParamFloat(argc, argv, "lenalpha", &alpha, 0.2F);
 }
 
 /*
@@ -75,6 +76,9 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
         /* predict the next state */
         predictor.Predict(next, &encoding, input, padding);
 
+        /* compute the model score (given the prediction probability) */
+        Score(cur, next);
+
         /* beam pruning */
         Generate(next);
     }
@@ -83,6 +87,37 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
 }
 
 /*
+compute the model score for each hypothesis
+>> prev - the beam of the previous state
+>> beam - the beam that keeps a number of states
+*/
+void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
+{
+    XTensor &score = beam->modelScore;
+    XTensor &prob = beam->prob;
+    XTensor &probPath = beam->probPath;
+    XTensor &lenPrev = prev->nstep;
+    XTensor &len = beam->nstep;
+    XTensor lp;
+
+    InitTensor(&score, &prob);
+
+    /* the log-scale probability of the entire sequence */
+    _Sum(&prob, &probPath, &score);
+
+    InitTensor(&len, &lenPrev);
+    InitTensor(&lp, &lenPrev);
+
+    _ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
+
+    /* the GNMT-like length penalty */
+    T2TLengthPenalizer::GNMT(len, lp, alpha);
+
+    /* score = log-prob / lp */
+    _Div(&score, &lp, &score);
+}
+
+/*
 generate tokens for the next state via beam pruning
 >> beam - the beam that keeps a number of states
 */
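For a single hypothesis, the tensor operations in Score reduce to a few scalar steps. The following minimal sketch mirrors that arithmetic with hypothetical values (plain floats in place of XTensor):

#include <cmath>
#include <cstdio>

int main()
{
    /* hypothetical state carried over from the previous step */
    float probPath  = -3.2F;  /* log-prob of the path so far  */
    float nstepPrev = 4.0F;   /* number of steps taken so far */
    float alpha     = 0.2F;   /* the "lenalpha" parameter     */

    /* log-prob of this step's prediction */
    float prob = -0.7F;

    /* log-scale probability of the entire sequence (the _Sum call) */
    float score = prob + probPath;

    /* one more step (the _ScaleAndShift call: scale 1, shift 1) */
    float nstep = nstepPrev + 1.0F;

    /* GNMT-like length penalty and the final model score (the _Div call) */
    float lp = std::pow((5.0F + nstep) / 6.0F, alpha);
    score = score / lp;

    printf("log-prob = %.2f  lp = %.3f  model score = %.3f\n",
           prob + probPath, lp, score);
    return 0;
}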
@@ -93,7 +128,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
     int dimsTopK[MAX_TENSOR_DIM_NUM];
 
     XTensor scoreTopK;
-    XTensor &score = beam->score;
+    XTensor &score = beam->modelScore;
     XTensor &index = beam->prediction;
     XTensor &preID = beam->preID;
     int order = score.order;
@@ -131,16 +166,20 @@ void T2TSearch::Generate(T2TStateBundle * beam)
     /* "preID" represents the id (or the offset) of previous state used to make the current
        hypothesis. Note that we reshape the "score" tensor into a matrix where each
        row means a previous state. The column number is size-of-beam * vocab-size. We,
-       therefore, divide the top-k index by vocab-size to compute the id of previous state for
-       each hypothesis in the top-k list. */
+       therefore, divide entries of the top-k index by vocab-size to compute the id of
+       previous state for each hypothesis in the top-k list. */
     Descale(preID, sizeVocab);
 
-    /* Then we are going to do something similar to "preID". For the top-k predictions, we
-       need to know their indices in the vocabulary. We compute the offset of each prediction
-       in the vocabulary by dividing it with vocab-size and computing the remainder. */
+    /* Then, we do something similar to "preID". For the top-k predictions, we need
+       to know their indices in the vocabulary. We compute the offset of each prediction
+       in the vocabulary by dividing it with vocab-size and computing the remainder. */
     Mod(index, sizeVocab);
 
     score.Reshape(order, dims);
 
+    /* we keep the top-k scores */
+    InitTensor(&score, &scoreTopK);
+    CopyValues(scoreTopK, score);
 }
...
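The Descale/Mod pair above amounts to integer division and remainder on offsets into the flattened beam-by-vocabulary score matrix. A standalone sketch with made-up sizes:

#include <cstdio>

int main()
{
    /* the beam-by-vocab score matrix is flattened per row, so a top-k entry
       is an offset in [0, beamSize * vocabSize) */
    const int vocabSize = 10000;                /* hypothetical vocabulary size */
    const int topK[3]   = {30127, 2045, 49999}; /* hypothetical top-k offsets   */

    for (int i = 0; i < 3; i++) {
        int preID = topK[i] / vocabSize;        /* previous state id (Descale) */
        int word  = topK[i] % vocabSize;        /* word id in the vocab (Mod)  */
        printf("offset %5d -> previous state %d, word %d\n", topK[i], preID, word);
    }
    return 0;
}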
@@ -35,6 +35,9 @@ namespace transformer
 class T2TSearch
 {
 private:
+    /* the alpha parameter controls the length preference */
+    float alpha;
+
     /* predictor */
     T2TPredictor predictor;
@@ -57,6 +60,9 @@ public:
     /* search for the most promising states */
     void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
 
+    /* compute the model score for each hypothesis */
+    void Score(T2TStateBundle * prev, T2TStateBundle * beam);
+
     /* generate token indices via beam pruning */
     void Generate(T2TStateBundle * beam);
...