/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */

#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__

#include "T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../tensor/function/FHeader.h"

using namespace nts;

namespace transformer
{

/* trainer of the T2T model */
class T2TTrainer
{
public:
38 39 40 41 42
    /* paramter number */
    int argNum;

    /* parameter array */
    char ** argArray;
43 44 45 46 47 48
    
    /* dimension size of each inner layer */
    int d;
    
    /* step number of warm-up for training */
    int nwarmup;
49 50 51 52

    /* vocabulary size of the source side */
    int vSize;

53 54 55
    /* vocabulary size of the target side */
    int vSizeTgt;

56 57
    /* learning rate */
    float lrate;
58 59 60
    
    /* the parameter that controls the maximum learning rate in training */
    float lrbias;
61 62 63 64 65 66 67 68 69 70 71 72 73

    /* sentence batch size */
    int sBatchSize;

    /* word batch size */
    int wBatchSize;

    /* training epoch number */
    int nepoch;

    /* traing step number */
    int nstep;

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
    /* indicates whether we use adam */
    bool useAdam;

    /* hyper parameters of adam*/
    float adamBeta1;
    float adamBeta2;
    float adamDelta;
    float adamBeta1T;
    float adamBeta2T;

    /* list of the moment of the parameter matrics */
    XList moments;

    /* list of the 2nd order moment of the parameter matrics */
    XList moments2nd;

90 91 92 93 94 95 96 97 98 99 100 101 102 103
    /* indicates whether the data file is shuffled for training */
    bool isShuffled;
    
    /* the factor of label smoothing */
    DTYPE labelSmoothingP;

    /* number of steps after which we make a checkpoint */
    int nStepCheckpoint;

    /* indicates whether we make a checkpoint after each traing epoch */
    bool useEpochCheckpoint;
    
    /* number of batches on which we do model update */
    int updateStep;
104

105 106
    /* indicates whether we intend to debug the net */
    bool isDebugged;
107

xiaotong committed
108 109 110 111 112
    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

    /* for batching */
    T2TBatchLoader batchLoader;
113

114 115 116 117 118 119 120 121
public:
    /* constructor */
    T2TTrainer();

    /* de-constructor */
    ~T2TTrainer();

    /* initialize the trainer */
122
    void Init(int argc, char ** argv);
123 124

    /* train the model */
125
    void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
126

127 128 129
    /* test the model */
    void Test(const char * fn, const char * ofn, T2TModel * model);

130 131
    /* make a checkpoint */
    void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
132 133 134 135 136
    
    /* get word probabilities for a batch of sequences */
    float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);

    /* update the model by delta rule */
137
    void Update(T2TModel * model, const float lr);
138 139 140 141 142

    /* prepare model for training */
    void PrepareModel(T2TModel * model);

    /* do padding on the output */
143 144 145 146 147 148 149
    void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
    
    /* recale the output and gold tensors for normalized loss */
    void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
    
    /* perform label smoothing */
    void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
150 151 152 153 154
};


}

#endif