/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */

#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__

#include "T2TModel.h"

#include "../../tensor/function/FHeader.h"

/* maximum sequence length (4096); the replacement list is parenthesized so that
   the macro expands as a single value in expressions such as x / MAX_SEQUENCE_LENGTH
   or x % MAX_SEQUENCE_LENGTH */
#define MAX_SEQUENCE_LENGTH (1024 * 4)

/* NOTE(review): a using-directive at header scope is visible in every file that
   includes this header. The declarations below use unqualified names such as
   XTensor, XList and XMem, which presumably come from nts - confirm before
   removing or narrowing it. */
using namespace nts;

namespace transformer
{

/* 
 * trainer of the T2T model.
 * The class keeps a buffer of loaded words, the training hyper-parameters and
 * the optimizer state (moments), and declares the entry points for training
 * and testing. Only declarations are given here; the definitions are elsewhere.
 *
 * NOTE(review): the class holds raw pointers (buf, seqLen, seqOffset) and
 * declares a destructor but no copy constructor/assignment. If these pointers
 * are owned by the trainer (not visible from this header), copying an instance
 * would be unsafe - confirm against the implementation.
 */
class T2TTrainer
{
public:
    /* buffer for loading words */
    int * buf;

    /* buffer size */
    int bufSize;

    /* length of each sequence */
    int * seqLen;

    /* offset of the first word for each sequence */
    int * seqOffset;

    /* number of sequences in the buffer */
    int nseqBuf;

    /* offset for next sequence in the buffer */
    int nextSeq;
    
    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;
    
    /* dimension size of each inner layer */
    int d;
    
    /* step number of warm-up for training */
    int nwarmup;

    /* vocabulary size of the source side */
    int vSize;

    /* learning rate */
    float lrate;
    
    /* the parameter that controls the maximum learning rate in training */
    float lrbias;

    /* sentence batch size */
    int sBatchSize;

    /* word batch size */
    int wBatchSize;

    /* training epoch number */
    int nepoch;

    /* training step number */
    int nstep;

    /* indicates whether we use adam */
    bool useAdam;

    /* hyper parameters of adam
       (adamBeta1T and adamBeta2T are presumably the running powers of
       adamBeta1 and adamBeta2 over the update steps - TODO confirm) */
    float adamBeta1;
    float adamBeta2;
    float adamDelta;
    float adamBeta1T;
    float adamBeta2T;

    /* list of the moments of the parameter matrices */
    XList moments;

    /* list of the 2nd order moments of the parameter matrices */
    XList moments2nd;

public:
    /* constructor */
    T2TTrainer();

    /* destructor */
    ~T2TTrainer();

    /* initialize the trainer
       >> argc - number of arguments
       >> argv - list of arguments (presumably command-line style options
                 that set the members above - TODO confirm) */
    void Init(int argc, const char ** argv);

    /* train the model
       >> fn - name of a file (presumably the training data file - TODO confirm)
       >> model - the model to train */
    void Train(const char * fn, T2TModel * model);

    /* test the model
       >> fn - name of a file (presumably the test data file - TODO confirm)
       >> ofn - name of a second file (presumably the output file - TODO confirm)
       >> model - the model to test */
    void Test(const char * fn, const char * ofn, T2TModel * model);

    /* load data to buffer
       >> file - the file to read from
       << return - an int (presumably the number of sequences loaded - TODO confirm) */
    int LoadBuf(FILE * file);

    /* clear data buffer */
    void ClearBuf();

    /* load a batch of sequences
       >> file - the file to read from
       >> isLM - presumably indicates whether the data is used for language modeling - TODO confirm
       >> batch - tensor for the batch
       >> padding - tensor for the padding of the batch
       >> output - tensor for the output (presumably the gold standard - TODO confirm)
       >> seqs - integer array related to the sequences in the batch (exact content not visible here)
       >> step - step value (meaning not visible here - TODO confirm)
       >> vs - presumably the vocabulary size - TODO confirm
       >> sBatch - presumably the batch size in sentences (see sBatchSize) - TODO confirm
       >> wBatch - presumably the batch size in words (see wBatchSize) - TODO confirm
       >> isSorted - presumably indicates whether the sequences are sorted by length - TODO confirm
       >> wCount - passed by reference (presumably receives the word count of the batch - TODO confirm)
       >> devID - presumably the device id for the tensors - TODO confirm
       >> mem - presumably the memory pool for the tensors - TODO confirm
       << return - an int (presumably the number of sequences in the batch - TODO confirm) */
    int LoadBatch(FILE * file, bool isLM,
                  XTensor * batch, XTensor * padding, XTensor * output, 
                  int * seqs,
                  int step, int vs, int sBatch, int wBatch, 
                  bool isSorted, int &wCount,
                  int devID, XMem * mem);
    
    /* get word probabilities for a batch of sequences
       >> output - presumably the model output for the batch - TODO confirm
       >> gold - presumably the gold standard for the batch - TODO confirm
       >> wordProbs - presumably receives the per-word probabilities - TODO confirm
       << return - a float (its exact meaning is not visible from this header) */
    float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);

    /* update the model by delta rule
       >> model - the model to update
       >> lr - learning rate */
    void Update(T2TModel * model, const float lr);

    /* prepare model for training
       >> model - the model to prepare */
    void PrepareModel(T2TModel * model);

    /* do padding on the output
       >> output - the output tensor
       >> padding - the padding tensor */
    void PadOutput(XTensor * output, XTensor * padding);
};


}

#endif
