Commit 00d7b386 by liyinqiao

Update the Transformer sample based on the NiuTrans.NMT.

parent 3b93be69
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Munich 18@@ 56 : Four maps that will change your view of the city
A mental asylum , where today young people are said to meet .
A cryp@@ t chap@@ el , where they are now dig@@ ging t@@ unn@@ els for the S @@@ -@@ @ Bahn .
Al@@ lo@@ t@@ ment holders cul@@ tiv@@ ate the soil of former farmers .
The oldest official map of Munich brings cap@@ tiv@@ ating stories to light .
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -26,7 +26,7 @@
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
#include "./sample/transformer/NMT.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -34,7 +34,7 @@
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace nmt;
int main( int argc, const char ** argv )
{
......@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
NMTMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TDecoder.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(T2TConfig& config)
void AttDecoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nDecLayer;
......@@ -80,16 +77,17 @@ void AttDecoder::InitModel(T2TConfig& config)
/* embedding model */
embedder.InitModel(config, false);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
selfAttLayerNorms = new LN[nlayer];
enDeAtt = new Attention[nlayer];
enDeAttLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new T2TLN;
decoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
......@@ -99,6 +97,8 @@ void AttDecoder::InitModel(T2TConfig& config)
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
selfAttCache[i].enable = true;
enDeAttCache[i].enable = true;
}
if (preNorm)
decoderLayerNorm->InitModel(config);
......@@ -115,9 +115,10 @@ make the decoding network
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
......@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
}
if (preNorm)
x = decoderLayerNorm->Make(x);
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network (pre-norm variant: layer normalization is applied
BEFORE each sub-layer, with a final normalization on the decoder output;
compare with Make(), which supports both pre-norm and post-norm)
NOTE(review): this function unconditionally dereferences decoderLayerNorm,
which InitModel only allocates when preNorm is true — callers must ensure
the model was built with pre-norm enabled; TODO confirm.
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                             XTensor* maskEncDec, int nstep, bool isTraining)
{
    XTensor x;

    /* embed the decoder input; nstep presumably offsets positions for
       incremental (step-by-step) decoding — TODO confirm against Embedder::Make */
    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout (training only) */
    if (isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    /* each iteration is one decoder layer:
       self-attn -> enc-dec attn -> fnn, each wrapped as
       x = res + Sublayer(LN(x)) */
    for (int i = 0; i < nlayer; i++) {
        XTensor res;

        /* keep the input for the residual connection */
        res = x;

        /* layer normalization with pre-norm for self-attn */
        x = selfAttLayerNorms[i].Make(x);

        /******************/
        /* self attention (uses per-layer cache, enabled in InitModel) */
        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for encoder-decoder attention */
        x = enDeAttLayerNorms[i].Make(x);

        /* encoder-decoder attention: keys/values come from the encoder
           output, the query from the decoder state */
        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
                            isTraining, &enDeAttCache[i], EN_DE_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for fnn */
        x = fnnLayerNorms[i].Make(x);

        /* fnn */
        x = fnns[i].Make(x, isTraining);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);
    }

    /* final normalization of the stack output (pre-norm networks need this) */
    x = decoderLayerNorm->Make(x);

    return x;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#ifndef __DECODER_H__
#define __DECODER_H__
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
#include "Encoder.h"
#include "Utility.h"
namespace transformer
namespace nmt
{
class AttDecoder
......@@ -52,28 +51,28 @@ public:
DTYPE dropoutP;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN* fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention* selfAtt;
Attention* selfAtt;
/* layer normalization for attention */
T2TLN* selfAttLayerNorms;
LN* selfAttLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for decoder */
T2TLN* decoderLayerNorm;
LN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
T2TAttention* enDeAtt;
Attention* enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN* enDeAttLayerNorms;
LN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
......@@ -92,11 +91,15 @@ public:
~AttDecoder();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the decoding network */
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(T2TConfig& config)
void AttEncoder::InitModel(Config& config)
{
devID = config.devID;
......@@ -68,18 +65,18 @@ void AttEncoder::InitModel(T2TConfig& config)
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(config);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
attLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
if (preNorm)
encoderLayerNorm = new T2TLN;
encoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
......@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
......@@ -151,7 +148,63 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = encoderLayerNorm->Make(x);
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (pre-norm variant: layer normalization is applied
BEFORE each sub-layer, with a final normalization on the encoder output;
compare with Make(), which supports both pre-norm and post-norm)
NOTE(review): this function unconditionally dereferences encoderLayerNorm,
which InitModel only allocates when preNorm is true — callers must ensure
the model was built with pre-norm enabled; TODO confirm.
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> maskEncDec - no use
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    XTensor x;

    /* word + position embedding of the source input */
    x = embedder.Make(input, false, isTraining);

    /* dropout (training only) */
    if (isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    /* each iteration is one encoder layer:
       self-attn -> fnn, each wrapped as x = res + Sublayer(LN(x)) */
    for (int i = 0; i < nlayer; i++) {
        XTensor res;

        /* keep the input for the residual connection */
        res = x;

        /* layer normalization with pre-norm for self-attn */
        x = attLayerNorms[i].Make(x);

        /* self attention (no cache on the encoder side, hence NULL) */
        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for fnn */
        x = fnnLayerNorms[i].Make(x);

        /* fnn */
        x = fnns[i].Make(x, isTraining);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);
    }

    /* final normalization of the stack output (pre-norm networks need this) */
    x = encoderLayerNorm->Make(x);

    return x;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,25 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TENCODER_H__
#define __T2TENCODER_H__
#ifndef __ENCODER_H__
#define __ENCODER_H__
#include "module/T2TFNN.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "module/T2TEmbedding.h"
#include "module/T2TLayerNormal.h"
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
base class of the encoder
*/
class T2TEncoder
class Encoder
{
public:
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
......@@ -47,7 +46,7 @@ public:
/*
the encoder based on self-attention
*/
class AttEncoder : T2TEncoder
class AttEncoder : Encoder
{
public:
/* device id */
......@@ -73,22 +72,22 @@ public:
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN* fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention* selfAtt;
Attention* selfAtt;
/* layer normalizations for attention */
T2TLN* attLayerNorms;
LN* attLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for encoder */
T2TLN* encoderLayerNorm;
LN* encoderLayerNorm;
/* the location of layer normalization */
bool preNorm;
......@@ -101,11 +100,14 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the encoding network */
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,32 +21,32 @@
#include <cstdint>
#include "T2TModel.h"
#include "module/T2TUtility.h"
#include "Model.h"
#include "Utility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TModel::T2TModel()
Model::Model()
{
devID = -1;
isLM = false;
isMT = false;
useFP16 = false;
shareAllEmbeddings = false;
shareDecInputOutputWeight = false;
shareAllEmbeddings = 0;
shareDecInputOutputWeight = 0;
nhead = 1;
encoder = new AttEncoder();
decoder = new AttDecoder();
outputLayer = new T2TOutput();
outputLayer = new Output();
}
/* de-constructor */
T2TModel::~T2TModel()
Model::~Model()
{
delete encoder;
delete decoder;
......@@ -58,7 +57,7 @@ T2TModel::~T2TModel()
initialize the model
>> config - configurations of the model
*/
void T2TModel::InitModel(T2TConfig& config)
void Model::InitModel(Config& config)
{
devID = config.devID;
isMT = config.isMT;
......@@ -71,8 +70,8 @@ void T2TModel::InitModel(T2TConfig& config)
&config.fnnHiddenSize, &config.modelSize,
&config.embSize, &config.srcVocabSize,
&config.tgtVocabSize, &config.nhead,
&config.maxRP, &shareAllEmbeddings,
&shareDecInputOutputWeight,
&config.maxRP, &config.shareAllEmbeddings,
&config.shareDecInputOutputWeight,
&config.maxPosLen
};
......@@ -81,10 +80,28 @@ void T2TModel::InitModel(T2TConfig& config)
/* read model configurations */
if (!config.isTraining) {
modelFile = fopen(config.modelFN, "rb");
for (auto& meta : metaInfo)
CheckNTErrors(modelFile, "Failed to open the model file");
for (auto& meta : metaInfo) {
fread(meta, sizeof(int), 1, modelFile);
}
}
else {
/* read the source and target vocab size */
FILE* trainF = fopen(config.trainFN, "rb");
CheckNTErrors(trainF, "Failed to open the training file");
fread(&config.srcVocabSize, sizeof(config.srcVocabSize), 1, trainF);
fread(&config.tgtVocabSize, sizeof(config.tgtVocabSize), 1, trainF);
CheckNTErrors(config.srcVocabSize > 0, "Invalid source vocabulary size");
CheckNTErrors(config.tgtVocabSize > 0, "Invalid target vocabulary size");
fclose(trainF);
}
nhead = config.nhead;
shareAllEmbeddings = config.shareAllEmbeddings;
shareDecInputOutputWeight = config.shareDecInputOutputWeight;
ShowModelConfig(config);
encoder->InitModel(config);
outputLayer->InitModel(config);
......@@ -92,13 +109,12 @@ void T2TModel::InitModel(T2TConfig& config)
if (isMT)
decoder->InitModel(config);
TensorList params(10);
GetParams(params);
/* load parameters */
if (!config.isTraining)
Read(modelFile);
else {
TensorList params;
GetParams(params);
for (int i = 0; i < params.Size(); i++)
params[i]->SetVarFlag();
}
......@@ -108,13 +124,28 @@ void T2TModel::InitModel(T2TConfig& config)
}
/*
print model configurations to stderr (one line per setting)
NOTE(review): config could be passed as const& — it is only read here;
left as-is to keep the signature consistent with the declaration elsewhere.
>> config - model configurations
*/
void Model::ShowModelConfig(Config& config)
{
    /* TODO: output more info */
    XPRINT1(0, stderr, "encoder layer: %d\n", config.nEncLayer);
    XPRINT1(0, stderr, "decoder layer: %d\n", config.nDecLayer);
    XPRINT1(0, stderr, "attention heads: %d\n", config.nhead);
    XPRINT1(0, stderr, "model size: %d\n", config.modelSize);
    XPRINT1(0, stderr, "source vocab size: %d\n", config.srcVocabSize);
    XPRINT1(0, stderr, "target vocab size: %d\n", config.tgtVocabSize);
}
/*
make the encoding network
>> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> input - input tensor, (batchSize, srcLen)
>> mask - the mask for encoder self-attention, (headNum, batchSize, srcLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result
<< return - encoding result, (batchSize, srcLen, hiddenDim)
*/
XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
......@@ -123,18 +154,17 @@ XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
/*
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> output - output tensor (distribution)
>> mask - mask for positions that are/not involved in computation
>> maskEncDec - mask for the encoder-decoder attention
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> outputEnc - output tensor of the encoder, (batchSize, srcLen, hiddenDim)
>> mask - mask for decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask for the encoder-decoder attention, (headNum, batchSize, tgtLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result
<< return - decoding result, (batchSize, tgtLen, hiddenDim)
*/
XTensor T2TModel::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
XTensor* mask, XTensor& maskEncDec, bool isTraining)
XTensor Model::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
inputDec.GetDim(1), isTraining);
}
......@@ -145,7 +175,7 @@ make the network for language modeling (with the output softmax layer)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
int len = padding.GetDim(padding.order - 1);
int* dims = new int[padding.order + 2];
......@@ -173,19 +203,19 @@ void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool is
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side)
>> paddingDec - padding of the sequences (on the decoder side)
>> inputEnc - input tensor of the encoder, (batchSize, srcLen)
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> output - output tensor (distribution), (batchSize, tgtLen, hiddenDim)
>> paddingEnc - padding of the sequences (on the encoder side), (batchSize, srcLen)
>> paddingDec - padding of the sequences (on the decoder side), (batchSize, tgtLen)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec,
bool isTraining)
void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
......@@ -213,9 +243,9 @@ make the mask for training MT models
>> maksDec - mask of the decoder self-attention
>> maksEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int* dims = new int[inputDec.order + 2];
......@@ -235,8 +265,8 @@ void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
......@@ -260,8 +290,7 @@ void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
......@@ -284,38 +313,28 @@ void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
/*
make the mask of the encoder
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> maskEnc - mask of the encoder self-attention, (headNum, batchSize, srcLen, srcLen)
*/
void T2TModel::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
{
XTensor padding2;
XTensor padding3;
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, padding3, 0, nhead);
ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, &padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
SumMe(maskEnc, padding3);
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
/*
make the mask of the decoder
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maksDec - mask of the decoder self-attention
>> maksEncDec - mask of the decoder enc-dec attention
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> paddingDec - padding of the decoder input, (batchSize, tgtLen)
>> maksDec - mask of the decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maksEncDec - mask of the decoder enc-dec attention, (headNum, batchSize, tgtLen, srcLen)
*/
void T2TModel::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec)
void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec)
{
int len = paddingDec.GetDim(paddingDec.order - 1);
int* dims = new int[paddingDec.order + 2];
......@@ -340,26 +359,27 @@ void T2TModel::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
delete[] dims;
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrics
*/
void T2TModel::GetParams(TensorList& list)
void Model::GetParams(TensorList& list)
{
list.Clear();
/* encoder parameters */
for (int i = 0; i < encoder->nlayer; i++) {
list.Add(&encoder->selfAtt[i].wq);
list.Add(&encoder->selfAtt[i].wk);
list.Add(&encoder->selfAtt[i].wv);
list.Add(&encoder->selfAtt[i].bq);
list.Add(&encoder->selfAtt[i].bk);
list.Add(&encoder->selfAtt[i].bv);
list.Add(&encoder->selfAtt[i].weightQ);
list.Add(&encoder->selfAtt[i].weightK);
list.Add(&encoder->selfAtt[i].weightV);
list.Add(&encoder->selfAtt[i].biasQ);
list.Add(&encoder->selfAtt[i].biasK);
list.Add(&encoder->selfAtt[i].biasV);
if (encoder->selfAtt[i].useRPR)
list.Add(&encoder->selfAtt[i].RPEmbK);
list.Add(&encoder->selfAtt[i].wo);
list.Add(&encoder->selfAtt[i].bo);
list.Add(&encoder->selfAtt[i].weightO);
list.Add(&encoder->selfAtt[i].biasO);
list.Add(&encoder->fnns[i].w1);
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
......@@ -377,26 +397,26 @@ void T2TModel::GetParams(TensorList& list)
if (isMT) {
/* decoder parameters */
for (int i = 0; i < decoder->nlayer; i++) {
list.Add(&decoder->selfAtt[i].wq);
list.Add(&decoder->selfAtt[i].wk);
list.Add(&decoder->selfAtt[i].wv);
list.Add(&decoder->selfAtt[i].bq);
list.Add(&decoder->selfAtt[i].bk);
list.Add(&decoder->selfAtt[i].bv);
list.Add(&decoder->selfAtt[i].weightQ);
list.Add(&decoder->selfAtt[i].weightK);
list.Add(&decoder->selfAtt[i].weightV);
list.Add(&decoder->selfAtt[i].biasQ);
list.Add(&decoder->selfAtt[i].biasK);
list.Add(&decoder->selfAtt[i].biasV);
if (decoder->selfAtt[i].useRPR)
list.Add(&decoder->selfAtt[i].RPEmbK);
list.Add(&decoder->selfAtt[i].wo);
list.Add(&decoder->selfAtt[i].bo);
list.Add(&decoder->selfAtt[i].weightO);
list.Add(&decoder->selfAtt[i].biasO);
list.Add(&decoder->selfAttLayerNorms[i].w);
list.Add(&decoder->selfAttLayerNorms[i].b);
list.Add(&decoder->enDeAtt[i].wq);
list.Add(&decoder->enDeAtt[i].wk);
list.Add(&decoder->enDeAtt[i].wv);
list.Add(&decoder->enDeAtt[i].bq);
list.Add(&decoder->enDeAtt[i].bk);
list.Add(&decoder->enDeAtt[i].bv);
list.Add(&decoder->enDeAtt[i].wo);
list.Add(&decoder->enDeAtt[i].bo);
list.Add(&decoder->enDeAtt[i].weightQ);
list.Add(&decoder->enDeAtt[i].weightK);
list.Add(&decoder->enDeAtt[i].weightV);
list.Add(&decoder->enDeAtt[i].biasQ);
list.Add(&decoder->enDeAtt[i].biasK);
list.Add(&decoder->enDeAtt[i].biasV);
list.Add(&decoder->enDeAtt[i].weightO);
list.Add(&decoder->enDeAtt[i].biasO);
list.Add(&decoder->enDeAttLayerNorms[i].w);
list.Add(&decoder->enDeAttLayerNorms[i].b);
list.Add(&decoder->fnns[i].w1);
......@@ -418,8 +438,9 @@ void T2TModel::GetParams(TensorList& list)
list.Add(&decoder->embedder.w);
}
if (shareDecInputOutputWeight == 0)
if (shareDecInputOutputWeight == 0) {
list.Add(&outputLayer->w);
}
}
/*
......@@ -427,14 +448,14 @@ dump the model to a file
>> fn - where to save the model
>> model - the model
*/
void T2TModel::Dump(const char* fn)
void Model::Dump(const char* fn)
{
double startT = GetClockSec();
FILE* file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params(100);
TensorList params;
GetParams(params);
......@@ -459,22 +480,29 @@ void T2TModel::Dump(const char* fn)
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model saved (took %.1fs)\n", elapsed);
LOG("model saved (took %.1fs)", elapsed);
}
/* read the parameters */
void T2TModel::Read(FILE* file)
void Model::Read(FILE* file)
{
double startT = GetClockSec();
TensorList params(100);
TensorList params;
GetParams(params);
LOG("params count: %lu", params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
}
LOG("params size: %d", size);
/* convert parameters to FP16 */
/* convert parameters to FP16 before reading files */
if (useFP16) {
LOG("Convert parameters to FP16");
for (int i = 0; i < params.Size(); i++) {
XTensor* p = params[i];
InitTensorV2(p, p->order, p->dimSize, X_FLOAT16, 1, p->devID);
InitTensor(p, p->order, p->dimSize, X_FLOAT16, p->devID, p->enableGrad && X_ENABLE_GRAD);
}
auto& encEmb = encoder->embedder.posEmbeddingBase;
......@@ -488,18 +516,18 @@ void T2TModel::Read(FILE* file)
/* share all embeddings */
if (shareAllEmbeddings == 1) {
decoder->embedder.w = CopyValues(encoder->embedder.w);
XPRINT(0, stderr, "[INFO] sharing encoder decoder embeddings\n");
_CopyValues(&encoder->embedder.w, &decoder->embedder.w);
LOG("sharing encoder decoder embeddings");
}
/* share embeddings with output weights */
if (shareDecInputOutputWeight == 1) {
outputLayer->w = CopyValues(decoder->embedder.w);
XPRINT(0, stderr, "[INFO] sharing decoder embeddings with output weights\n");
_CopyValues(&decoder->embedder.w, &outputLayer->w);
LOG("sharing decoder embeddings with output weights");
}
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
LOG("model loaded (took %.1fs)", elapsed);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,23 +19,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TMODEL_H__
#define __T2TMODEL_H__
#ifndef __MODEL_H__
#define __MODEL_H__
#include "T2TEncoder.h"
#include "T2TDecoder.h"
#include "module/T2TFNN.h"
#include "module/T2TOutput.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "Utility.h"
#include "module/Attention.h"
namespace transformer
namespace nmt
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
/* a nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
{
public:
/* device id */
......@@ -49,7 +47,7 @@ public:
AttDecoder* decoder;
/* output layer */
T2TOutput* outputLayer;
Output* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -71,13 +69,16 @@ public:
public:
/* constructor */
T2TModel();
Model();
/* de-constructor */
~T2TModel();
~Model();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,49 +16,47 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/
#include <cmath>
#include <ctime>
#include "Transformer.h"
#include "train/T2TTrainer.h"
#include "module/T2TUtility.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
#include "NMT.h"
#include "train/Trainer.h"
#include "translate/Translator.h"
namespace transformer
namespace nmt
{
int TransformerMain(int argc, const char** argv)
int NMTMain(int argc, const char** argv)
{
if (argc == 0)
return 1;
/* load configurations */
T2TConfig config(argc, argv);
Config config(argc, argv);
srand((unsigned int)time(NULL));
srand(1);
/* train the model */
/* training */
if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config);
T2TTrainer trainer;
Trainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
/* translate the test file */
/* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config);
T2TTranslator translator;
Translator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,29 +15,17 @@
*/
/*
*
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* work on MT
* An implementation of the NMT system.
*/
#ifndef __TRANSFORMER_H__
#define __TRANSFORMER_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
#ifndef __NMT_H__
#define __NMT_H__
namespace transformer
namespace nmt
{
/* entrance of the program */
int TransformerMain(int argc, const char** argv);
int NMTMain(int argc, const char** argv);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -27,13 +26,13 @@
#include <fstream>
#include <sstream>
#include "T2TUtility.h"
#include "../../../tensor/XGlobal.h"
#include "Utility.h"
#include "../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace transformer
namespace nmt
{
/*
......@@ -41,7 +40,7 @@ load configurations from the command
>> argc - number of arguments
>> argv - the list of arguments
*/
T2TConfig::T2TConfig(int argc, const char** argv)
Config::Config(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
......@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 8);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 256);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
LoadParamString(argsNum, args, "model", modelFN, "model.bin");
LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
// TODO: refactor the parameters type to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
......@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argc, args, "lrate", &lrate, 1.0F);
LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
LoadParamInt(argc, args, "nepoch", &nepoch, 20);
LoadParamInt(argc, args, "nepoch", &nepoch, 50);
LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argc, args, "nstep", &nstep, 100000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argc, args, "adam", &useAdam, true);
LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
......@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1);
LoadParamBool(argc, args, "debug", &isDebugged, false);
LoadParamBool(argc, args, "sorted", &isLenSorted, false);
LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
......@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, args, "bucketsize", &bucketSize, 0);
LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
......@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++)
delete[] args[i];
......@@ -136,7 +139,7 @@ load configurations from a file
>> args - the list to store the configurations
format: one option per line, separated by a blank or a tab
*/
int T2TConfig::LoadFromFile(const char* configFN, char** args) {
int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,18 +19,18 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#ifndef __UTILITY_H__
#define __UTILITY_H__
#include <string>
#include <cstdio>
#include "../../../tensor/XList.h"
#include "../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace transformer
namespace nmt
{
#define MAX_PARAM_NUM 100
......@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */
class T2TConfig {
/* configurations for */
class Config {
public:
/* path to the model */
char modelFN[1024];
......@@ -131,6 +130,12 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
......@@ -164,9 +169,12 @@ public:
/* training epoch number */
int nepoch;
/* traing step number */
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */
bool useAdam;
......@@ -193,9 +201,6 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
......@@ -222,7 +227,7 @@ public:
public:
/* load configurations from the command */
T2TConfig(int argc, const char** argv);
Config(int argc, const char** argv);
/* load configurations from a file */
int LoadFromFile(const char* configFN, char** args);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -15,22 +14,20 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "Attention.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TAttention::T2TAttention()
Attention::Attention()
{
nhead = -1;
dk = -1;
......@@ -39,7 +36,7 @@ T2TAttention::T2TAttention()
}
/* de-constructor */
T2TAttention::~T2TAttention()
Attention::~Attention()
{
}
......@@ -47,7 +44,7 @@ T2TAttention::~T2TAttention()
initialize the model
>> config - the configurations of the network
*/
void T2TAttention::InitModel(T2TConfig& config)
void Attention::InitModel(Config& config)
{
devID = config.devID;
useRPR = config.useRPR;
......@@ -59,28 +56,34 @@ void T2TAttention::InitModel(T2TConfig& config)
maxRP = config.maxRP;
dropoutP = config.attDropout;
InitTensor2D(&wq, d, d, X_FLOAT, devID);
InitTensor1D(&bq, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID);
/* initialize the parameters */
InitTensor2D(&weightQ, d, d, X_FLOAT, devID);
InitTensor1D(&biasQ, d, X_FLOAT, devID);
InitTensor2D(&weightK, d, d, X_FLOAT, devID);
InitTensor1D(&biasK, d, X_FLOAT, devID);
InitTensor2D(&weightV, d, d, X_FLOAT, devID);
InitTensor1D(&biasV, d, X_FLOAT, devID);
if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&wo, d, d, X_FLOAT, devID);
InitTensor1D(&bo, d, X_FLOAT, devID);
InitTensor2D(&weightO, d, d, X_FLOAT, devID);
InitTensor1D(&biasO, d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&wk, scale);
_SetDataFanInOut(&wq, scale);
_SetDataFanInOut(&wv, scale);
_SetDataFanInOut(&wo, scale);
_SetDataFanInOut(&weightK, scale);
_SetDataFanInOut(&weightQ, scale);
_SetDataFanInOut(&weightV, scale);
_SetDataFanInOut(&weightO, scale);
if (useRPR)
_SetDataFanInOut(&RPEmbK, scale);
bk.SetZeroAll();
bq.SetZeroAll();
bv.SetZeroAll();
bo.SetZeroAll();
biasQ.SetZeroAll();
biasO.SetZeroAll();
biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
}
/*
......@@ -96,30 +99,30 @@ make the network
>> cacheType - type of cache, e.g., self-attention
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int cacheType)
XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int attType)
{
const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MulAndShift(q, wq, bq);
q2 = MulAndShift(q, weightQ, biasQ);
if (!cache || isTraining) {
if (!cache || isTraining || !(cache->enable)) {
/* self attention for encoder layers */
k2 = MulAndShift(k, wk, bk);
v2 = MulAndShift(v, wv, bv);
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
if (useRPR)
if (useRPR && attType == SELF_ATT)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
else {
if (cacheType == SELF_ATT) {
k2 = MulAndShift(k, wk, bk);
v2 = MulAndShift(v, wv, bv);
if (attType == SELF_ATT) {
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
/* if hit, we only concat the cache with the new token */
if (!cache->miss) {
......@@ -134,10 +137,10 @@ XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
else if (cacheType == EN_DE_ATT) {
else if (attType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MulAndShift(k, wk, bk);
cache->value = MulAndShift(v, wv, bv);
cache->key = MulAndShift(k, weightK, biasK);
cache->value = MulAndShift(v, weightV, biasV);
cache->miss = false;
}
......@@ -155,8 +158,8 @@ make the attention network given keys, queries and values (after linear transfor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
......@@ -185,7 +188,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask)
dot = dot + (*mask);
dot = dot + *mask;
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
......@@ -203,7 +206,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
/*
......@@ -216,16 +219,16 @@ with the given keys, queries and values (after linear transformation)
>> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder
*/
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batchSize = q.dimSize[0];
const int lenQ = q.dimSize[1];
const int lenKV = k.dimSize[1];
const int batchSize = q.GetDim(0);
const int lenQ = q.GetDim(1);
const int lenKV = k.GetDim(1);
const auto dataType = k.dataType;
......@@ -241,7 +244,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc);
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix);
......@@ -252,12 +255,13 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
ScaleAndShiftMe(qheads, 1.0F / float(nhead));
float scaling = sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask)
dot = dot + (*mask);
dot = dot + *mask;
/* softmax */
scalar = Softmax(dot, -1);
......@@ -275,7 +279,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
/*
......@@ -284,8 +288,8 @@ generate relative position embeddings
>> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position
*/
XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
{
XTensor range;
XTensor embMatrix;
......@@ -309,37 +313,46 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
embMatrix = Unsqueeze(range, 0, lenQ);
}
ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
ScaleAndShiftMe(embMatrix, 1.0F, float(maxRelativeLen));
//ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index;
return embMatrix;
}
/*
Relative position-aware dot-product attention inner calculation.
relative position-aware dot-product attention inner calculation.
>> x - Tensor with shape [batch_size*heads, length, length or depth].
>> y - Tensor with shape [batch_size*heads, length, depth].
>> z - Tensor with shape [length, length, depth].
>> isKey - Whether y is key.
<< return - A Tensor with shape [batch_size*heads, length, length or depth].
*/
XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{
const int headNum = nhead;
const int batchSize = x.dimSize[1];
const int lenQ = x.dimSize[2];
const int lenKV = y.dimSize[2];
const int depth = y.dimSize[3];
const int batchSize = x.GetDim(1);
const int lenQ = x.GetDim(2);
const int lenKV = y.GetDim(2);
const int depth = y.GetDim(3);
const int lastDim = isKey ? lenKV : depth;
MATRIX_TRANS_TYPE transposeFlag = isKey ? X_TRANS : X_NOTRANS;
auto transposeFlag = isKey ? X_TRANS : X_NOTRANS;
XTensor context;
context = MatrixMulBatched(x, X_NOTRANS, y, transposeFlag);
int mergeDimsX[] = { headNum * batchSize, lenQ, x.GetDim(3) };
int mergeDimsY[] = { headNum * batchSize, lenKV, y.GetDim(3) };
x = Reshape(x, 3, mergeDimsX);
y = Reshape(y, 3, mergeDimsY);
if (isKey) {
y = Transpose(y, 1, 2);
}
int mergeDims[] = { headNum * batchSize, lenQ, x.dimSize[3] };
x.Reshape(3, mergeDims);
XTensor context;
context = BMMul(x, y);
int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
context = Reshape(context, 4, newDims);
XTensor xTrans;
xTrans = Transpose(x, 0, 1);
......@@ -351,15 +364,17 @@ XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const boo
relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans.Reshape(4, splitDims);
return Sum(context, relativeTrans);
relativeTrans = Reshape(relativeTrans, 4, splitDims);
return context + relativeTrans;
}
/* constructor */
Cache::Cache()
{
miss = true;
enable = true;
}
/* update the states cache */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,17 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#ifndef __ATTENTION_H__
#define __ATTENTION_H__
#include "T2TNNUtil.h"
#include "T2TUtility.h"
#include "NNUtil.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
......@@ -50,6 +49,9 @@ public:
/* indicates cache miss if 'true' */
bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */
Cache();
......@@ -64,7 +66,7 @@ public:
};
/* multi-head attention */
class T2TAttention
class Attention
{
public:
/* device id */
......@@ -74,22 +76,22 @@ public:
int nhead;
/* transformation matrix for Q */
XTensor wq;
XTensor weightQ;
/* bias for Q */
XTensor bq;
XTensor biasQ;
/* transformation matrix for K */
XTensor wk;
XTensor weightK;
/* bias for K */
XTensor bk;
XTensor biasK;
/* transformation matrix for V */
XTensor wv;
XTensor weightV;
/* bias for V */
XTensor bv;
XTensor biasV;
XTensor wBig;
......@@ -99,10 +101,10 @@ public:
XTensor RPEmbK;
/* transformation after dot-product attention */
XTensor wo;
XTensor weightO;
/* bias after dot-product attention */
XTensor bo;
XTensor biasO;
/* size of transformed Q and K */
int dk;
......@@ -124,13 +126,13 @@ public:
public:
/* constructor */
T2TAttention();
Attention();
/* de-constructor */
~T2TAttention();
~Attention();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v,
......@@ -145,8 +147,10 @@ public:
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,11 @@
* This file includes some common modules of the Transformer model
*/
#include <cmath>
#include "T2TCommonModules.h"
#include "CommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/*
......@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
>> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn
*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{
if (after ^ prenorm)
return ln.Make(input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,16 +21,16 @@
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "T2TLayerNormal.h"
#include "T2TCommonModules.h"
#include "LayerNorm.h"
#include "CommonModules.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* the layer normalization module to control pre-norm or post-norm*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,17 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TEmbedder::T2TEmbedder()
Embedder::Embedder()
{
devID = -1;
vSize = -1;
......@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
}
/* de-constructor */
T2TEmbedder::~T2TEmbedder()
Embedder::~Embedder()
{
}
......@@ -47,7 +44,7 @@ initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
void Embedder::InitModel(Config& config, bool isEnc)
{
devID = config.devID;
d = config.modelSize;
......@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int length)
void Embedder::MakePosEmbedding(int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
......@@ -110,58 +107,45 @@ make the network
>> isTraining - indicates whether it is training
<< return - word & position embeddings of the input
*/
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* make sure the padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
if (!isDec)
if (!isDec || isTraining || input.GetDim(-1) > 1)
{
/* encoder embeddings */
for (int i = 0; i < inputCPU.dimSize[0]; i++) {
int startNoPad = 1 + 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
for (int j = 0; j < inputCPU.dimSize[1]; j++) {
if (p[j] == 1) {
posData[i * inputCPU.dimSize[1] + j] = 1;
}
else {
posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
position.Range(0, position.unitNum, 1);
// disable grad
ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
}
else
{
/* decoder embeddings */
position.SetDataFixed(nstep + 2);
/* decoder embeddings during decoding */
position.SetDataFixed(nstep + padIdx + 1);
}
delete[] posData;
/* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position);
XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */
//w.enableGrad = false;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "T2TUtility.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
#define DEFAULT_EMBEDDING_SIZE 512
......@@ -37,7 +36,7 @@ namespace transformer
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
class Embedder
{
public:
/* device id */
......@@ -52,7 +51,7 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
/* dimension size of the hidden layers in the model */
int d;
/* padding index */
......@@ -67,13 +66,13 @@ public:
public:
/* constructor */
T2TEmbedder();
Embedder();
/* de-constructor */
~T2TEmbedder();
~Embedder();
/* initialize the model */
void InitModel(T2TConfig& config, bool isEnc = true);
void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int length);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,19 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "FNN.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TFNN::T2TFNN()
FNN::FNN()
{
inSize = -1;
outSize = -1;
......@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
}
/* de-constructor */
T2TFNN::~T2TFNN()
FNN::~FNN()
{
}
......@@ -50,7 +47,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> config - configurations of the model
*/
void T2TFNN::InitModel(T2TConfig& config)
void FNN::InitModel(Config& config)
{
devID = config.devID;
......@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
}
......@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
XTensor FNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#ifndef __FNN_H__
#define __FNN_H__
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
class FNN
{
public:
/* device id */
......@@ -66,13 +65,13 @@ public:
public:
/* constructor */
T2TFNN();
FNN();
/* de-constructor */
~T2TFNN();
~FNN();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,16 +18,13 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "GLU.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -48,7 +44,7 @@ GLU::~GLU()
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(T2TConfig& config)
void GLU::InitModel(Config& config)
{
devID = config.devID;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -23,12 +22,11 @@
#ifndef __GLU_H__
#define __GLU_H__
#include "T2TLayerNormal.h"
#include "T2TGatedLinearUnit.h"
#include "LayerNorm.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
......@@ -68,7 +66,7 @@ public:
~GLU();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,19 +18,16 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "Embedding.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer
namespace nmt
{
/* constructor */
......@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(T2TConfig& config)
void LayerHistory::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
......@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new T2TLN[nlayer];
layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,14 +21,14 @@
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
......@@ -61,7 +60,7 @@ public:
TensorList history;
/* layer normalization for each intimidate layer */
T2TLN* layerNorms;
LN* layerNorms;
public:
/* constructor */
......@@ -71,7 +70,7 @@ public:
~LayerHistory();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* add the layer output to the history */
void Add(XTensor& tensor);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,24 +19,23 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "Embedding.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TLN::T2TLN()
LN::LN()
{
devID = -1;
d = 0;
}
/* de-constructor */
T2TLN::~T2TLN()
LN::~LN()
{
}
......@@ -47,7 +45,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> config - configurations of the model
*/
void T2TLN::InitModel(T2TConfig& config)
void LN::InitModel(Config& config)
{
devID = config.devID;
......@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
w.SetDataFixed(1);
}
/*
......@@ -64,7 +64,7 @@ make the network
>> input - the input tensor
>> return - layer normalization output
*/
XTensor T2TLN::Make(XTensor& input)
XTensor LN::Make(XTensor& input)
{
XTensor& x = input;
XTensor xn;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#ifndef __LAYERNORMAL_H__
#define __LAYERNORMAL_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
#include "../Utility.h"
#include "../../../network//XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
class LN
{
public:
/* device id */
......@@ -50,13 +49,13 @@ public:
public:
/* constructor */
T2TLN();
LN();
/* de-constructor */
~T2TLN();
~LN();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,12 +15,12 @@
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "T2TNNUtil.h"
#include "NNUtil.h"
namespace transformer
namespace nmt
{
/*
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,11 +15,11 @@
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __T2TNNUTIL_H__
#define __T2TNNUTIL_H__
#ifndef __NNUTIL_H__
#define __NNUTIL_H__
#include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h"
......@@ -28,7 +27,7 @@
using namespace nts;
namespace transformer
namespace nmt
{
/* the gather function for tensor with any dimension */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,18 +19,16 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "Output.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TOutput::T2TOutput()
Output::Output()
{
devID = -1;
vSize = -1;
......@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
}
/* de-constructor */
T2TOutput::~T2TOutput()
Output::~Output()
{
}
......@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
initialize the model
>> config - configurations of the model
*/
void T2TOutput::InitModel(T2TConfig& config)
void Output::InitModel(Config& config)
{
devID = config.devID;
hSize = config.modelSize;
......@@ -66,7 +63,7 @@ make the network (redefined output tensor)
>> isTraining - whether it is used for training
>> normalized - whether ignore the log-softmax
*/
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
XTensor& x = input;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,19 +19,19 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#ifndef __OUTPUT_H__
#define __OUTPUT_H__
#include "T2TUtility.h"
#include "../Utility.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* output layer */
class T2TOutput
class Output
{
public:
/* device id */
......@@ -49,13 +48,13 @@ public:
public:
/* constructor */
T2TOutput();
Output();
/* de-constructor */
~T2TOutput();
~Output();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include "T2TBatchLoader.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../network/XNoder.h"
namespace transformer
{
/* constructor: put the loader into a clean "nothing loaded" state */
T2TBatchLoader::T2TBatchLoader()
{
    /* heap buffers are created later in Init() */
    buf = NULL;
    buf2 = NULL;
    bufBatch = NULL;
    seqLen = NULL;
    seqLen2 = NULL;
    seqOffset = NULL;

    /* counters and cursors: -1 marks "no batch/sequence loaded yet" */
    bufSize = 0;
    bufBatchSize = 0;
    nseqBuf = 0;
    nextSeq = -1;
    nextBatch = -1;
}
/* de-constructor: release every heap buffer (delete[] on NULL is a no-op) */
T2TBatchLoader::~T2TBatchLoader()
{
    delete[] seqOffset;
    delete[] seqLen2;
    delete[] seqLen;
    delete[] bufBatch;
    delete[] buf2;
    delete[] buf;
}
/*
initialization of the batch loader
>> config - global configurations: buffer size, batching strategy flags
   (isDoubledEnd / isSmallBatch / isBigBatch / isRandomBatch) and the
   bucket size used when sorting samples by length
*/
void T2TBatchLoader::Init(T2TConfig& config)
{
    bufSize = config.bufSize;
    isDoubledEnd = config.isDoubledEnd;
    isSmallBatch = config.isSmallBatch;
    isBigBatch = config.isBigBatch;
    isRandomBatch = config.isRandomBatch;
    bucketSize = config.bucketSize;

    /* release any previously-allocated buffers so a second Init() does not
       leak (delete[] on NULL is a no-op; the constructor NULLs these) */
    delete[] buf;
    delete[] buf2;
    delete[] bufBatch;
    delete[] seqLen;
    delete[] seqLen2;
    delete[] seqOffset;

    /* the word-id buffers and the per-sequence length/offset bookkeeping
       arrays all share one capacity */
    buf = new int[bufSize];
    buf2 = new int[bufSize];
    bufBatch = new BatchNode[bufSize];
    seqLen = new int[bufSize];
    seqLen2 = new int[bufSize];
    seqOffset = new int[bufSize];
}
/* shared line buffer used by LoadBuf; being file-scope, it makes the loading
   routines non-reentrant and not thread-safe */
char line[MAX_SEQUENCE_LENGTH];
/* a sample (a group of `step` consecutive sequences) used when sorting the
   buffer for batching */
struct SampleNode
{
    /* index of the node in creation order */
    int id;

    /* index of the first sequence of this sample in the sequence arrays */
    int offset;

    /* pointer to the words of this sample inside the word buffer */
    int* p;

    /* total number of words in the sample */
    int size;

    /* sort value: the maximum sequence length within the sample */
    int value;

    /* random key used to shuffle samples within a bucket */
    int key;
};

/* qsort comparator: descending by value. explicit comparisons are used
   instead of "b - a", whose subtraction can overflow int for extreme
   operands (undefined behavior) */
int CompareSampleNode(const void* a, const void* b)
{
    int va = ((SampleNode*)a)->value;
    int vb = ((SampleNode*)b)->value;
    return (vb > va) - (vb < va);
}

/* qsort comparator: descending by the random key. keys come from rand(),
   so the difference of two keys can exceed INT_MAX; the comparison form
   avoids that overflow */
int CompareSampleNodeV2(const void* a, const void* b)
{
    int ka = ((SampleNode*)a)->key;
    int kb = ((SampleNode*)b)->key;
    return (kb > ka) - (kb < ka);
}
/*
load data to the buffer
>> file - file to load data from; each line holds word ids separated by
   spaces/tabs, and a " ||| " token splits multiple sequences within a line
>> isSorted - whether to sort the loaded samples by length
>> step - the number of sequences that form one sample (e.g., 2 for
   source/target pairs in MT)
<< returns the number of lines read
*/
int T2TBatchLoader::LoadBuf(FILE* file, bool isSorted, int step)
{
    int lineCount = 0;
    int seqCount = 0;
    int wordCount = 0;

    while (fgets(line, MAX_SEQUENCE_LENGTH - 1, file)) {
        int len = (int)strlen(line);

        /* strip trailing newline characters; the "len > 0" guard prevents an
           out-of-bounds read (line[-1]) on lines made only of '\r'/'\n' */
        while (len > 0 && (line[len - 1] == '\r' || line[len - 1] == '\n')) {
            line[len - 1] = 0;
            len--;
        }

        len = (int)strlen(line);
        if (len == 0)
            continue;

        /* how many characters are in a word */
        int wSize = 0;

        /* how many words are in the sentence */
        int wNum = 0;
        int wNumLocal = 0;

        int i = 0;
        for (i = 0; i < len; i++) {
            /* load word (id) separated by space or tab */
            if ((line[i] == ' ' || line[i] == '\t') && wSize > 0) {
                line[i] = 0;
                if (wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|') {
                    /* a "|||" token closes the current sequence of the line */
                    seqLen[seqCount] = wNumLocal;
                    seqOffset[seqCount] = wordCount + wNum - wNumLocal;
                    seqCount++;
                    wNumLocal = 0;
                }
                else {
                    buf[wordCount + wNum++] = atoi(line + i - wSize);
                    wNumLocal++;
                }
                wSize = 0;
            }
            else
                wSize++;
        }

        /* flush the last word of the line */
        if (wSize > 0) {
            buf[wordCount + wNum++] = atoi(line + i - wSize);
            wNumLocal++;
        }

        /* close the last sequence of the line */
        seqLen[seqCount] = wNumLocal;
        seqOffset[seqCount] = wordCount + wNum - wNumLocal;
        seqCount++;

        wordCount += wNum;
        lineCount++;

        /* stop before the word buffer could overflow on the next line */
        if (wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
            break;

        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
    }

    nseqBuf = seqCount;
    nextSeq = 0;

    /* sort the sequences by length */
    if (isSorted) {
        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
        SampleNode* nodes = new SampleNode[seqCount];
        int count = 0;
        int offset = 0;
        for (int i = 0; i < seqCount; i += step) {
            SampleNode& node = nodes[count];
            node.id = count;
            node.offset = i;
            node.p = buf + offset;
            node.size = 0;
            int max = 0;
            for (int j = 0; j < step; j++) {
                node.size += seqLen[i + j];
                max = MAX(max, seqLen[i + j]);
            }
            node.value = max;

            /* random key for shuffling within a bucket later */
            node.key = rand();
            count++;
            offset += node.size;
        }

        /* sort the samples by their maximum sequence length (descending) */
        qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);

        /* distribute samples into buckets. In each bucket, sequences have
           similar a length */
        if (bucketSize > 0) {
            int low = 0;
            int high = low + bucketSize;
            int n = count - 1;
            int m = n;
            int num = 0;
            while (num < count) {
                for (m = n; m >= 0; m--) {
                    if (nodes[m].value > high)
                        break;
                }

                /* shuffle the samples that fall into the current bucket */
                qsort(nodes + m + 1, n - m, sizeof(SampleNode), CompareSampleNodeV2);
                num += (n - m);
                n = m;
                low += bucketSize;
                high = low + bucketSize;
            }
        }

        /* write the reordered words/lengths/offsets into the spare arrays */
        count = 0;
        offset = 0;
        for (int i = 0; i < seqCount; i += step) {
            SampleNode& node = nodes[count];
            memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
            for (int j = 0; j < step; j++) {
                seqLen2[i + j] = seqLen[node.offset + j];

                /* NOTE(review): this only adds the immediately preceding
                   sequence's length, so it is correct for step <= 2 only —
                   confirm larger steps are never used */
                seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0);
            }
            count += 1;
            offset += node.size;
        }

        /* swap the double buffers */
        int* tmp = buf;
        buf = buf2;
        buf2 = tmp;
        tmp = seqLen;
        seqLen = seqLen2;
        seqLen2 = tmp;

        delete[] nodes;
    }

    return lineCount;
}
/* empty the data buffer so that the next access triggers a reload */
void T2TBatchLoader::ClearBuf()
{
    nextSeq = -1;
    nseqBuf = 0;
}
/*
set the random batch flag
>> flag - as it is
*/
void T2TBatchLoader::SetRandomBatch(bool flag)
{
isRandomBatch = flag;
}
/*
load a batch of sequences, dispatching to the LM or MT loader
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary (MT only)
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> ws - (returned) source-side word count (MT only)
>> wCount - (returned) word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatch(FILE* file, bool isLM,
                              XTensor* batchEnc, XTensor* paddingEnc,
                              XTensor* batchDec, XTensor* paddingDec,
                              XTensor* gold, XTensor* label,
                              int* seqs,
                              int vsEnc, int vsDec, int sBatch, int wBatch,
                              bool isSorted, int& ws, int& wCount,
                              int devID, bool isTraining)
{
    /* language modeling: single-sequence samples; vsDec and ws are unused */
    if (isLM)
        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec,
                           gold, label, seqs, vsEnc, sBatch, wBatch,
                           isSorted, wCount, devID, isTraining);

    /* machine translation: paired source/target samples */
    return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec,
                       gold, label, seqs, vsEnc, vsDec, sBatch, wBatch,
                       isSorted, ws, wCount, devID, isTraining);
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSize - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - (returned) word count
>> devID - device id
>> isTraining - indicates whether we are training the model
<< returns the number of sequences in the batch
*/
int T2TBatchLoader::LoadBatchLM(FILE* file,
                                XTensor* batchEnc, XTensor* paddingEnc,
                                XTensor* batchDec, XTensor* paddingDec,
                                XTensor* gold, XTensor* label,
                                int* seqs,
                                int vSize, int sBatch, int wBatch,
                                bool isSorted, int& wCount,
                                int devID, bool isTraining)
{
    /* refill the buffer when it has been fully consumed */
    if (nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 1);

    int seq = MAX(nextSeq, 0);
    int wc = 0;
    int wn = 0;
    int sc = 0;
    int max = 0;

    /* grow the batch until both the sequence count (sBatch) and the word
       budget (wBatch) are met */
    while (seq + sc < nseqBuf) {
        int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
        CheckNTErrors(len > 0, "Empty sequence!");
        wn = len;
        wc += wn;
        sc += 1;

        if (max < wn)
            max = wn;

        /* effective token count: total words in "big batch" mode, otherwise
           the padded size max * sc */
        int tc = isBigBatch ? wc : max * sc;
        if (sc >= sBatch && tc >= wBatch)
            break;
    }

    wCount = 0;
    nextSeq = seq + sc;

    if (sc <= 0)
        return 0;

    int dims[MAX_TENSOR_DIM_NUM];
    dims[0] = sc;
    dims[1] = max;
    dims[2] = vSize;

    InitTensor2D(batchEnc, sc, max, X_INT, devID);
    InitTensor2D(label, sc, max, X_INT, devID);
    InitTensor(gold, 3, dims, X_FLOAT, devID);
    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID);
    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID);

    batchEnc->SetZeroAll();
    label->SetZeroAll();
    gold->SetZeroAll();
    paddingEnc->SetZeroAll();
    paddingDec->SetZeroAll();

    /* host-side staging arrays; the tensors are filled in one shot below */
    int seqSize = 0;
    int* batchEncValues = new int[batchEnc->unitNum];
    int* labelValues = new int[label->unitNum];
    MTYPE* goldOffsets = new MTYPE[gold->unitNum];
    MTYPE* paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
    MTYPE* paddingDecOffsets = new MTYPE[paddingDec->unitNum];

    int wGold = 0;

    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
    memset(labelValues, 0, sizeof(int) * label->unitNum);

    for (int s = seq; s < seq + sc; s++) {
        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
        CheckNTErrors(len <= max, "Something is wrong!");
        for (int w = 0; w < len; w++) {
            int num = buf[seqOffset[s] + w];
            batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
            paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
            paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);

            /* next-word prediction: the gold/label at position w-1 is the
               word observed at position w */
            if (w > 0) {
                goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
                labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w];
            }

            /* at the last position: predict the word itself when </s> is
               doubled, otherwise the following word in the buffer */
            if (w == len - 1) {
                if (isDoubledEnd) {
                    goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
                    labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w];
                }
                else {
                    goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
                    labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1];
                }
            }

            wCount++;

            if (seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
        }

        /* pad the kept sequence array with -1 */
        if (seqs != NULL) {
            for (int w = len; w < max; w++)
                seqs[seqSize++] = -1;
        }
    }

    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    label->SetData(labelValues, label->unitNum);
    gold->SetDataBatched(goldOffsets, 1.0F, wGold);
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);

    /*XTensor * tmp = NewTensorBuf(paddingEnc, devID);
    _ConvertDataType(batchEnc, tmp);
    _NotEqual(tmp, paddingEnc, 0);
    DelTensorBuf(tmp);

    XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
    _ConvertDataType(batchEnc, tmp2);
    _NotEqual(tmp2, paddingDec, 0);
    DelTensorBuf(tmp2);*/

    delete[] batchEncValues;
    delete[] labelValues;
    delete[] goldOffsets;
    delete[] paddingEncOffsets;
    delete[] paddingDecOffsets;

    return sc;
}
int CompareBatchNode(const void* a, const void* b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT); sequences in the buffer are stored as
(source, target) pairs at even/odd positions
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position; currently disabled)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSizeEnc - size of the encoder vocabulary
>> vSizeDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> ws - (returned) number of source-side words in the batch
>> wCount - (returned) target-side word count (the last position of each
   target sequence is excluded)
>> devID - device id
>> isTraining - indicates whether we are training the model
<< returns the number of sequences (source + target) in the batch
*/
int T2TBatchLoader::LoadBatchMT(FILE* file,
                                XTensor* batchEnc, XTensor* paddingEnc,
                                XTensor* batchDec, XTensor* paddingDec,
                                XTensor* gold, XTensor* label,
                                int* seqs,
                                int vSizeEnc, int vSizeDec, int sBatch, int wBatch,
                                bool isSorted, int& ws, int& wCount,
                                int devID, bool isTraining)
{
    /* refill and re-segment the buffer when all batches are consumed */
    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);

        int seq = 0;

        bufBatchSize = 0;
        nextBatch = 0;

        /* we segment the buffer into batches */
        while (seq < nseqBuf) {
            int wcEnc = 0;
            int wcDec = 0;
            int wnEnc = 0;
            int wnDec = 0;
            int maxEnc = 0;
            int maxDec = 0;
            int sc = 0;

            while (seq + sc < nseqBuf) {

                /* source-side sequence */
                wnEnc = seqLen[seq + sc];

                /* target-side sequence */
                wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;

                /* effective token count: total words in "big batch" mode,
                   otherwise the padded size; (sc + 2) / 2 counts the pairs
                   including the current one */
                int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
                int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;

                if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
                    break;

                wcEnc += wnEnc;
                sc += 1;

                if (maxEnc < wnEnc)
                    maxEnc = wnEnc;

                wcDec += wnDec;
                sc += 1;

                if (maxDec < wnDec)
                    maxDec = wnDec;
            }

            /* record the batch: [beg, end) sequence range plus max lengths */
            BatchNode& batch = bufBatch[bufBatchSize];
            batch.beg = seq;
            batch.end = seq + sc;
            batch.maxEnc = maxEnc;
            batch.maxDec = maxDec;
            batch.key = rand();
            bufBatchSize++;

            seq = seq + sc;
        }

        /* shuffle the batches by sorting on their random keys */
        if (isRandomBatch)
            qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
    }

    if (bufBatchSize <= 0)
        return 0;

    BatchNode& batch = bufBatch[nextBatch++];
    int seq = batch.beg;
    int sc = batch.end - batch.beg;
    int maxEnc = batch.maxEnc;
    int maxDec = batch.maxDec;
    CheckNTErrors(sc % 2 == 0, "The input samples must be paired");

    int sCount = sc / 2;
    int seqSize = 0;

    InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID);
    InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
    InitTensor2D(batchDec, sCount, maxDec, X_INT, devID);
    InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID);
    InitTensor2D(label, sCount, maxDec, X_INT, devID);
    //InitTensor(gold, 3, dimsDec, X_FLOAT, devID);

    batchEnc->SetZeroAll();
    paddingEnc->SetZeroAll();
    batchDec->SetZeroAll();
    paddingDec->SetZeroAll();
    label->SetZeroAll();
    //gold->SetZeroAll();

    int wCountEnc = 0;
    int wCountDec = 0;
    int wCountPad = 0;
    wCount = 0;

    /* host-side staging arrays; the tensors are filled in one shot below */
    int* batchEncValues = new int[batchEnc->unitNum];
    int* batchDecValues = new int[batchDec->unitNum];
    int* labelValues = new int[label->unitNum];
    MTYPE* paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
    MTYPE* paddingDecOffsets = new MTYPE[sc * maxDec / 2];
    //MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];

    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
    memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);

    /* NOTE(review): labelValues was allocated with label->unitNum but is
       cleared with batchDec->unitNum; both tensors are sCount x maxDec here
       so the sizes coincide — keep them in sync if shapes ever diverge */
    memset(labelValues, 0, sizeof(int) * batchDec->unitNum);

    /* batch of the source-side sequences (even positions of the pairs) */
    for (int s = seq; s < seq + sc; s += 2) {
        int len = seqLen[s];
        int sent = (s - seq) / 2;
        for (int w = 0; w < len; w++) {
            int num = buf[seqOffset[s] + w];
            batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
            paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
            wCountEnc++;
        }
    }
    ws = wCountEnc;
    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);

    //XTensor * tmp = NewTensorBuf(paddingEnc, devID);
    //_ConvertDataType(batchEnc, tmp);
    //tmp->Dump(stderr, "tmp:");
    //_NotEqual(tmp, paddingEnc, 0);
    //DelTensorBuf(tmp);

    /* batch of the target-side sequences (odd positions of the pairs) */
    for (int s = seq + 1; s < seq + sc; s += 2) {
        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
        CheckNTErrors(len <= maxDec, "Something is wrong!");
        int sent = (s - seq - 1) / 2;
        for (int w = 0; w < len; w++) {
            int num = buf[seqOffset[s] + w];
            batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
            //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);

            /* padding (and wCount) skips the last target position */
            if (w < len - 1) {
                paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
                wCount++;
            }

            /* shifted target: the label at position w-1 is the word at w */
            if (w > 0) {
                //goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
                labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w];
            }

            /* at the last position: the word itself when </s> is doubled,
               otherwise the following word in the buffer */
            if (w == len - 1) {
                if (isDoubledEnd) {
                    //goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]);
                    labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w];
                }
                else {
                    //goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
                    labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1];
                }
            }

            //wCount++;
            wCountDec++;
            if (seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
        }

        /* pad the kept sequence array with -1 */
        if (seqs != NULL) {
            for (int w = len; w < maxDec; w++)
                seqs[seqSize++] = -1;
        }
    }

    batchDec->SetData(batchDecValues, batchDec->unitNum);
    label->SetData(labelValues, label->unitNum);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);

    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
    //_ConvertDataType(batchDec, tmp2);
    //_NotEqual(tmp2, paddingDec, 0);
    //DelTensorBuf(tmp2);

    //gold->SetDataBatched(goldOffsets, 1.0F, wGold);

    delete[] batchEncValues;
    delete[] batchDecValues;
    delete[] labelValues;
    delete[] paddingEncOffsets;
    delete[] paddingDecOffsets;
    //delete[] goldOffsets;

    return sc;
}
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void T2TBatchLoader::Shuffle(const char* srcFile, const char* tgtFile)
{
char* line = new char[MAX_LINE_LENGTH];
#ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line);
#else
ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
delete[] line;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node that describes one prepared mini-batch inside the sequence buffer */
struct BatchNode
{
/* beginning position of the batch in the sequence buffer */
int beg;
/* end position of the batch in the sequence buffer */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting (used to reorder batches) */
int key;
};
/* batch loader of the original Transformer sample: reads token sequences
   into flat buffers and serves them as padded mini-batches */
class T2TBatchLoader
{
public:
/* buffer for loading words (token ids of all buffered sequences) */
int* buf;
/* another buffer (presumably used when reordering sequences - confirm in LoadBuf) */
int* buf2;
/* buffered batch descriptors, one BatchNode per prepared batch */
BatchNode* bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer (number of nodes in "bufBatch") */
int bufBatchSize;
/* length of each sequence */
int* seqLen;
/* another length array (counterpart of "buf2" - confirm in LoadBuf) */
int* seqLen2;
/* offset of the first word for each sequence */
int* seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size (for grouping sequences of similar length) */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization with the training configuration */
void Init(T2TConfig& config);
/* load data to buffer; returns the number of loaded items */
int LoadBuf(FILE* file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences (dispatches to the LM or MT variant) */
int LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vs, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* shuffle the data file (writes the shuffled copy to "tgtFile") */
void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-08-09
* TODO: refactor the data loader class and references
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "TrainDataSet.h"
#include "../Utility.h"
#include "../translate/Vocab.h"
using namespace nmt;
namespace nts {
/* sort the dataset by length (in descending order) */
void TrainDataSet::SortByLength() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return (a->srcSent.Size() + a->tgtSent.Size())
> (b->srcSent.Size() + b->tgtSent.Size());
});
}
/* sort buckets by key (in descending order) */
void TrainDataSet::SortBucket() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return a->bucketKey > b->bucketKey;
});
}
/*
sort the output by key in a range (in descending order)
>> begin - the first index of the range
>> end - the last index of the range
*/
void TrainDataSet::SortInBucket(int begin, int end) {
sort(buffer.items + begin, buffer.items + end,
[](TrainExample* a, TrainExample* b) {
return (a->key > b->key);
});
}
/*
load all data from a file to the buffer
training data format (binary):
first 8 bytes: number of sentence pairs
subsequent segments:
    source sentence length (4 bytes)
    target sentence length (4 bytes)
    source tokens (4 bytes per token)
    target tokens (4 bytes per token)
*/
void TrainDataSet::LoadDataToBuffer()
{
    buffer.Clear();
    curIdx = 0;

    int id = 0;
    uint64_t sentNum = 0;

    int srcVocabSize = 0;
    int tgtVocabSize = 0;

    /* check every read so that a truncated or corrupt file fails loudly
       instead of silently producing garbage lengths */
    size_t r = fread(&srcVocabSize, sizeof(srcVocabSize), 1, fp);
    CheckNTErrors(r == 1, "Failed to read the source vocabulary size");
    r = fread(&tgtVocabSize, sizeof(tgtVocabSize), 1, fp);
    CheckNTErrors(r == 1, "Failed to read the target vocabulary size");

    r = fread(&sentNum, sizeof(uint64_t), 1, fp);
    CheckNTErrors(r == 1, "Failed to read the sentence pairs number");
    CheckNTErrors(sentNum > 0, "Invalid sentence pairs number");

    while ((uint64_t)id < sentNum) {
        int srcLen = 0;
        int tgtLen = 0;
        r = fread(&srcLen, sizeof(int), 1, fp);
        CheckNTErrors(r == 1, "Failed to read the source sentence length");
        r = fread(&tgtLen, sizeof(int), 1, fp);
        CheckNTErrors(r == 1, "Failed to read the target sentence length");
        CheckNTErrors(srcLen > 0, "Invalid source sentence length");
        CheckNTErrors(tgtLen > 0, "Invalid target sentence length");

        IntList srcSent;
        IntList tgtSent;
        srcSent.ReadFromFile(fp, srcLen);
        tgtSent.ReadFromFile(fp, tgtLen);

        TrainExample* example = new TrainExample;
        example->id = id++;
        example->key = id;
        example->srcSent = srcSent;
        example->tgtSent = tgtSent;

        buffer.Add(example);
    }

    /* the file is fully consumed here; drop the handle so it cannot
       be closed twice */
    fclose(fp);
    fp = NULL;

    XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device (for training)
>> batchEnc - a tensor to store the batch of encoder input
>> paddingEnc - a tensor to store the batch of encoder paddings
>> batchDec - a tensor to store the batch of decoder input
>> paddingDec - a tensor to store the batch of decoder paddings
>> label - a tensor to store the gold output (decoder input shifted left by one)
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - a list holding {number of target tokens, number of sentences}
*/
UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    XTensor* batchDec, XTensor* paddingDec, XTensor* label,
    size_t minSentBatch, size_t batchSize, int devID)
{
    UInt64List info;
    size_t srcTokenNum = 0;
    size_t tgtTokenNum = 0;
    int realBatchSize = 1;

    /* outside training the sentence number is fixed */
    if (!isTraining)
        realBatchSize = (int)minSentBatch;

    /* get the maximum source sentence length in a mini-batch */
    size_t maxSrcLen = buffer[curIdx]->srcSent.Size();

    /* max batch size */
    const int MAX_BATCH_SIZE = 512;

    /* dynamic batching for sentences, enabled when the dataset is used for training */
    if (isTraining) {
        while (((size_t)realBatchSize < (buffer.Size() - curIdx))
            && (realBatchSize * maxSrcLen < batchSize)
            && (realBatchSize < MAX_BATCH_SIZE)
            && (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
            if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
                maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
            realBatchSize++;
        }
    }

    /* clamp the batch size to what is left in the buffer */
    if ((buffer.Size() - curIdx) < (size_t)realBatchSize) {
        realBatchSize = (int)(buffer.Size() - curIdx);
    }

    CheckNTErrors(realBatchSize > 0, "Invalid batch size");

    /* get the maximum target sentence length in a mini-batch */
    size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
    for (int i = 0; i < realBatchSize; i++) {
        if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
            maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
    }
    for (int i = 0; i < realBatchSize; i++) {
        if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
            maxSrcLen = buffer[curIdx + i]->srcSent.Size();
    }

    CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");

    int* batchEncValues = new int[realBatchSize * maxSrcLen];
    float* paddingEncValues = new float[realBatchSize * maxSrcLen];

    int* labelValues = new int[realBatchSize * maxTgtLen];
    int* batchDecValues = new int[realBatchSize * maxTgtLen];
    float* paddingDecValues = new float[realBatchSize * maxTgtLen];

    /* initialize everything as padding; real tokens overwrite it below */
    for (size_t i = 0; i < realBatchSize * maxSrcLen; i++) {
        batchEncValues[i] = PAD;
        paddingEncValues[i] = 1.0F;
    }
    for (size_t i = 0; i < realBatchSize * maxTgtLen; i++) {
        batchDecValues[i] = PAD;
        labelValues[i] = PAD;
        paddingDecValues[i] = 1.0F;
    }

    size_t curSrc = 0;
    size_t curTgt = 0;

    /*
    batchEnc: ends with EOS (right padding)
    batchDec: begins with SOS (right padding)
    label: ends with EOS (right padding)
    */
    for (int i = 0; i < realBatchSize; ++i) {
        srcTokenNum += buffer[curIdx + i]->srcSent.Size();
        tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();

        curSrc = maxSrcLen * i;
        for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
            batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
        }

        curTgt = maxTgtLen * i;
        for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
            /* the label is the decoder input shifted left by one position */
            if (j > 0)
                labelValues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
            batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
        }
        labelValues[curTgt - 1] = EOS;

        /* zero out the paddings after the real tokens */
        while (curSrc < maxSrcLen * (i + 1))
            paddingEncValues[curSrc++] = 0;
        while (curTgt < maxTgtLen * (i + 1))
            paddingDecValues[curTgt++] = 0;
    }

    InitTensor2D(batchEnc, realBatchSize, (int)maxSrcLen, X_INT, devID);
    InitTensor2D(paddingEnc, realBatchSize, (int)maxSrcLen, X_FLOAT, devID);
    InitTensor2D(batchDec, realBatchSize, (int)maxTgtLen, X_INT, devID);
    InitTensor2D(paddingDec, realBatchSize, (int)maxTgtLen, X_FLOAT, devID);
    InitTensor2D(label, realBatchSize, (int)maxTgtLen, X_INT, devID);

    /* advance the reading cursor past the consumed sentences */
    curIdx += realBatchSize;

    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    paddingEnc->SetData(paddingEncValues, paddingEnc->unitNum);
    batchDec->SetData(batchDecValues, batchDec->unitNum);
    paddingDec->SetData(paddingDecValues, paddingDec->unitNum);
    label->SetData(labelValues, label->unitNum);

    delete[] batchEncValues;
    delete[] paddingEncValues;
    delete[] batchDecValues;
    delete[] paddingDecValues;
    delete[] labelValues;

    info.Add(tgtTokenNum);
    info.Add(realBatchSize);
    return info;
}
/*
initialize the dataset
>> dataFile - path of the data file
>> myBucketSize - size of the bucket to keep similar length sentence pairs
>> training - indicates whether it is used for training
*/
void TrainDataSet::Init(const char* dataFile, int myBucketSize, bool training)
{
    bucketSize = myBucketSize;
    isTraining = training;
    curIdx = 0;

    fp = fopen(dataFile, "rb");
    CheckNTErrors(fp, "can not open the training file");

    /* read everything into memory and order it by length;
       buckets are only needed for training */
    LoadDataToBuffer();
    SortByLength();
    if (training)
        BuildBucket();
}
/* check whether every sentence in the buffer has been consumed */
bool TrainDataSet::IsEmpty() {
    return curIdx >= buffer.Size();
}
/* reset the reading cursor for a new epoch */
void TrainDataSet::ClearBuf()
{
    curIdx = 0;

    /* re-sort (and re-bucket when training) so that
       different epochs see different batches */
    SortByLength();
    if (isTraining)
        BuildBucket();
}
/* group data into buckets with similar length, then shuffle the buckets
   and sort the sentences inside each bucket by source length */
void TrainDataSet::BuildBucket()
{
    size_t idx = 0;

    /* build and shuffle buckets */
    while (idx < buffer.Size()) {

        /* sentence number in a bucket */
        size_t sentNum = 1;

        /* get the maximum source sentence length in a bucket */
        size_t maxSrcLen = buffer[idx]->srcSent.Size();

        /* bucketing for sentences.
           BUGFIX: the original code indexed the look-ahead sentence with
           `curIdx + sentNum`; `curIdx` is the batch-reading cursor (reset
           to 0 before this function runs), so buckets past the start of
           the buffer checked the wrong sentence lengths. The look-ahead
           must be relative to `idx`, as in LoadBatch. */
        while ((sentNum < (buffer.Size() - idx))
            && (sentNum * maxSrcLen < bucketSize)
            && (sentNum * buffer[idx + sentNum]->srcSent.Size() < bucketSize)) {
            if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
                maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
            sentNum++;
        }

        /* make sure the number is valid */
        if ((buffer.Size() - idx) < sentNum) {
            sentNum = buffer.Size() - idx;
        }

        /* all sentences in a bucket share one random key, so sorting by
           key shuffles whole buckets */
        int randomKey = rand();
        for (size_t i = 0; i < sentNum; i++) {
            buffer[idx + i]->bucketKey = randomKey;
        }

        idx += sentNum;
    }
    SortBucket();

    /* sort sentences by source length within each bucket */
    idx = 0;
    while (idx < buffer.Size()) {
        size_t sentNum = 0;
        int bucketKey = buffer[idx + sentNum]->bucketKey;
        while (sentNum < (buffer.Size() - idx)
            && buffer[idx + sentNum]->bucketKey == bucketKey) {
            buffer[idx + sentNum]->key = (int)buffer[idx + sentNum]->srcSent.Size();
            sentNum++;
        }
        SortInBucket((int)idx, (int)(idx + sentNum));
        idx += sentNum;
    }
}
/* de-constructor */
TrainDataSet::~TrainDataSet()
{
    /* the buffer owns its examples: free every one of them */
    size_t num = buffer.Size();
    for (size_t i = 0; i < num; i++)
        delete buffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a sentence pair used for training */
struct TrainExample {
/* id of the sentence pair (its position in the original file) */
int id;
/* source language sentence (token ids) */
IntList srcSent;
/* target language sentence (token ids) */
IntList tgtSent;
/* the key used to sort items within a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a binary file which contains training data;
   it loads the whole file into memory and serves padded mini-batches. */
struct TrainDataSet {
public:
/* the data buffer holding all loaded sentence pairs */
TrainBufferType buffer;
/* a list of empty line numbers
   (NOTE(review): not referenced in the visible code - confirm it is still used) */
IntList emptyLines;
/* the pointer to the file stream (closed once the data is loaded) */
FILE* fp;
/* current reading index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences of similar length */
size_t bucketSize;
/* indicates whether the dataset is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort a range [begin, end) of the buffer by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch; returns {target token number, sentence number} */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is fully consumed */
bool IsEmpty();
/* reset the buffer for a new epoch */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,30 +18,31 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#include <cmath>
#include "T2TTrainer.h"
#include "../module/T2TUtility.h"
#include "Trainer.h"
#include "../Utility.h"
#include "../../../network/XNoder.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/loss/LHeader.h"
#include "../../../network/XNoder.h"
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#endif
#include "../../../tensor/XMem.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TTrainer::T2TTrainer()
Trainer::Trainer()
{
cfg = NULL;
}
/* de-constructor */
T2TTrainer::~T2TTrainer()
Trainer::~Trainer()
{
for (int i = 0; i < moments.count; i++) {
XTensor* m = (XTensor*)moments.Get(i);
......@@ -59,15 +59,17 @@ T2TTrainer::~T2TTrainer()
initialization
>> config - configurations of the training process
*/
void T2TTrainer::Init(T2TConfig& config)
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
wBatchSize = config.wBatchSize;
bucketSize = config.bucketSize;
nepoch = config.nepoch;
nstep = config.nstep;
maxCheckpoint = config.maxCheckpoint;
d = config.modelSize;
nwarmup = config.nwarmup;
vSize = config.srcVocabSize;
......@@ -81,17 +83,12 @@ void T2TTrainer::Init(T2TConfig& config)
nStepCheckpoint = config.nStepCheckpoint;
useEpochCheckpoint = config.useEpochCheckpoint;
updateStep = config.updateStep;
isDebugged = config.isDebugged;
isLenSorted = config.isLenSorted;
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
batchLoader.Init(config);
}
int tc = 0;
/*
train the model
>> fn - training data file
......@@ -99,8 +96,14 @@ train the model
>> modelFN - where we keep the model
>> model - model to train
*/
void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model)
void Trainer::Train(const char* fn, const char* validFN,
const char* modelFN, Model* model)
{
/* disable cache during training */
for (int i = 0; i < model->decoder->nlayer; i++) {
model->decoder->selfAttCache[i].enable = false;
model->decoder->enDeAttCache[i].enable = false;
}
int step = 0;
int wc = 0;
int ws = 0;
......@@ -126,45 +129,42 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
#endif
int devID = model->devID;
XNet net;
PrepareModel(model);
double startT = GetClockSec();
for (epoch = 1; epoch <= nepoch; epoch++) {
#ifndef WIN32
if (isShuffled) {
fprintf(stderr, "shuffle the file\n");
batchLoader.Shuffle(fn, trainFN);
}
#endif
batchLoader.Init(fn, bucketSize, true);
FILE* file = fopen(trainFN, "r");
CheckNTErrors(file, "cannot open training file!");
for (epoch = 1; epoch <= nepoch; epoch++) {
wordCount = 0;
loss = 0;
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* reset the batch loader */
batchLoader.ClearBuf();
/* labels */
XTensor label;
while (!batchLoader.IsEmpty())
{
XNet net;
net.Clear();
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* gold standard */
XTensor gold;
/* labels */
XTensor label;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
{
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, wBatchSize, devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
......@@ -204,10 +204,18 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
/* update the parameters */
if (gradStep == updateStep) {
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) *
(float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
float warmupEndLR = lrate;
float warmupInitLR = 1e-7;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
/* learning rate, scheduled by inverse square root */
if (step < nwarmup)
lr = warmupInitLR + step * lrStep;
else
lr = decayFactor * pow((float)step, -0.5F);
/* model update */
Update(model, lr);
......@@ -224,15 +232,21 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
break;
}
if (step == 10) {
// LOG("after backward --------");
// lossTensor.mem->ShowMemUsage(stderr);
// exit(0);
}
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch,
wordCountTotal, batchCountTotal,
loss / wordCount, exp(loss / wordCount), exp(lossBatch / wc));
LOG("elapsed=%.1fs, step=%d, epoch=%d, "
"total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, lr=%.2e",
elapsed, step, epoch, wordCountTotal, batchCountTotal,
loss / wordCount / log(2.0), exp(loss / wordCount), lr);
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
}
if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
......@@ -242,8 +256,6 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
}
}
fclose(file);
if (isEnd)
break;
......@@ -255,10 +267,14 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
epoch = MIN(epoch, nepoch);
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, loss / wordCount, exp(loss / wordCount));
XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
elapsed, step, nSkipped, epoch);
LOG("lr=%.2e, elapsed=%.1fs, step=%d, "
"epoch=%d, word=%d, loss=%.3f, ppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss / wordCount / log(2.0), exp(loss / wordCount));
LOG("training finished (took %.1fs, step=%d, "
"skipped=%d and epoch=%d)", elapsed, step, nSkipped, epoch);
LOG("saving the final model");
model->Dump(modelFN);
delete[] trainFN;
}
......@@ -269,7 +285,7 @@ test the model
>> ofn - output data file
>> model - model that is trained
*/
void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
void Trainer::Validate(const char* fn, const char* ofn, Model* model)
{
int wc = 0;
int ws = 0;
......@@ -278,42 +294,36 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
float loss = 0;
/* data files */
FILE* file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
batchLoader.Init(fn, 0, false);
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int* seqs = new int[MILLION];
while (!batchLoader.IsEmpty())
{
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
batchLoader.ClearBuf();
/* label */
XTensor label;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, model->devID, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* output probabilities */
XTensor output;
/* prediction probabilities */
XTensor labelOnehot;
XTensor lossTensor;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, 0, model->devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
/* make the network */
if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false);
......@@ -326,52 +336,20 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
int bSize = output.GetDim(0);
int length = output.GetDim(1);
/* prediction probabilities */
XTensor labelOnehot;
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor);
/* dump the test result */
for (int s = 0; s < bSize; s++) {
DTYPE sum = 0;
int* seq = seqs + s * length;
for (int i = 0; i < length; i++) {
if (seq[i] >= 0) {
fprintf(ofile, "%d ", seq[i]);
}
else
break;
}
fprintf(ofile, "||| ");
for (int i = 0; i < length; i++) {
if (seq[i] >= 0) {
DTYPE p = lossTensor.Get2D(s, i);
fprintf(ofile, "%.3e ", p);
sum += p;
}
else
break;
}
fprintf(ofile, "||| %e\n", sum);
}
loss += lossBatch;
wordCount += wc;
sentCount += bSize;
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
LOG("test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)",
elapsed, sentCount, wordCount, loss / wordCount / log(2.0), exp(loss / wordCount));
}
/*
......@@ -382,20 +360,29 @@ make a checkpoint
>> label - label of the model
>> id - id of the checkpoint
*/
void T2TTrainer::MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id)
void Trainer::MakeCheckpoint(Model* model, const char* validFN,
const char* modelFN, const char* label, int id)
{
fprintf(stderr, "make a checkpoint\n");
LOG("make a checkpoint");
char* fn = new char[MAX_LINE_LENGTH];
Trainer validator;
validator.Init(*cfg);
/* save last checkpoints */
id = validator.maxCheckpoint - (maxCheckpoint--);
if (maxCheckpoint == 0)
maxCheckpoint = validator.maxCheckpoint;
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
model->Dump(fn);
delete[] fn;
char* fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
if (validFN != NULL) {
T2TTrainer trainer;
trainer.Init(*cfg);
trainer.Validate(validFN, fn2, model);
validator.Validate(validFN, fn2, model);
}
delete[] fn2;
}
......@@ -405,12 +392,12 @@ update the model by delta rule
\theta_{new} = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
>> model - the t2t model
>> model - the model
>> lr - learning rate
*/
void T2TTrainer::Update(T2TModel* model, const float lr)
void Trainer::Update(Model* model, const float lr)
{
TensorList ws(100);
TensorList ws;
model->GetParams(ws);
......@@ -465,12 +452,12 @@ void T2TTrainer::Update(T2TModel* model, const float lr)
prepare model for training
>> model - the model for training
*/
void T2TTrainer::PrepareModel(T2TModel* model)
void Trainer::PrepareModel(Model* model)
{
moments.Clear();
moments2nd.Clear();
TensorList ws(100);
TensorList ws;
model->GetParams(ws);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,25 +18,24 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__
#ifndef __TRAINER_H__
#define __TRAINER_H__
#include "../T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../../tensor/function/FHeader.h"
#include "../Model.h"
#include "TrainDataSet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* trainer of the T2T model */
class T2TTrainer
/* trainer of the model */
class Trainer
{
public:
/* configurations */
T2TConfig* cfg;
Config* cfg;
/* dimension size of each inner layer */
int d;
......@@ -63,12 +61,18 @@ public:
/* word batch size */
int wBatchSize;
/* size of bucket for grouping data by length */
int bucketSize;
/* training epoch number */
int nepoch;
/* traing step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use adam */
bool useAdam;
......@@ -100,39 +104,36 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
/* used for loading batches */
TrainDataSet batchLoader;
public:
/* constructor */
T2TTrainer();
Trainer();
/* de-constructor */
~T2TTrainer();
~Trainer();
/* initialize the trainer */
void Init(T2TConfig& config);
void Init(Config& config);
/* train the model */
void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
/* test the model */
void Validate(const char* fn, const char* ofn, T2TModel* model);
void Validate(const char* fn, const char* ofn, Model* model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
/* update the model by delta rule */
void Update(T2TModel* model, const float lr);
void Update(Model* model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel* model);
void PrepareModel(Model* model);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -26,24 +25,26 @@
#include <fstream>
#include <algorithm>
#include "T2TDataSet.h"
#include "../module/T2TUtility.h"
#include "DataSet.h"
#include "../Utility.h"
using namespace transformer;
using namespace nmt;
namespace nts {
/* sort the output by id (in ascending order) */
void DataSet::SortInput() {
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
return a->values.count > b->values.count;
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count,
[](Example* a, Example* b) {
return a->values.count > b->values.count;
});
}
/* sort the input by length (in descending order) */
void DataSet::SortOutput() {
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
return a->id < b->id;
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count,
[](Result* a, Result* b) {
return a->id < b->id;
});
}
......@@ -74,7 +75,7 @@ void DataSet::LoadDataToBuffer()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(3);
values.Add(UNK);
else
values.Add(srcVocab.word2id.at(word));
}
......@@ -100,7 +101,7 @@ void DataSet::LoadDataToBuffer()
}
/*
load a mini-batch to the device
load a mini-batch to the device (for translating)
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentence batch
......@@ -117,10 +118,10 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
/* dynamic batching for sentences */
while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
&& (realBatchSize * maxLen < batchSize)) {
realBatchSize++;
}
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
// && (realBatchSize * maxLen < batchSize)) {
// realBatchSize++;
//}
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
......@@ -133,13 +134,13 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; i++) {
batchValues[i] = 1;
paddingValues[i] = 0.0F;
batchValues[i] = PAD;
paddingValues[i] = 1.0F;
}
size_t cur = 0;
size_t curSrc = 0;
/* left padding */
/* right padding */
UInt64List infos;
size_t totalLength = 0;
......@@ -147,11 +148,11 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
paddingValues[cur++] = 1.0F;
}
curSrc = maxLen * i;
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
while (curSrc < maxLen * (i + 1))
paddingValues[curSrc++] = 0;
}
infos.Add(totalLength);
......@@ -178,7 +179,7 @@ the constructor of DataSet
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
fp = new ifstream(dataFile);
CheckNTErrors(fp->is_open(), "can not open the file");
CheckNTErrors(fp->is_open(), "Can not open the test data");
bufferUsed = 0;
CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -26,7 +25,7 @@
#include <cstdio>
#include <vector>
#include <fstream>
#include "T2TVocab.h"
#include "Vocab.h"
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,11 +21,11 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "T2TLengthPenalty.h"
#include "LengthPenalty.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
......@@ -36,7 +35,7 @@ where n = length of the sequence
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence
*/
float T2TLengthPenalizer::GNMT(float length, float alpha)
float LengthPenalizer::GNMT(float length, float alpha)
{
float base;
float lp;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,21 +21,21 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#ifndef __LENGTHPENALTY_H__
#define __LENGTHPENALTY_H__
#include "../module/T2TUtility.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* We intend to penalize short sequences because they have higher score
in product of a sequence of probability-like terms and have more chances
to beat others in search. */
class T2TLengthPenalizer
class LengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,23 +21,23 @@
#include <iostream>
#include "T2TPredictor.h"
#include "../module/T2TNNUtil.h"
#include "Predictor.h"
#include "../module/NNUtil.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* constructor */
T2TStateBundle::T2TStateBundle()
StateBundle::StateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
StateBundle::~StateBundle()
{
if (states != NULL)
delete[] states;
......@@ -48,18 +47,18 @@ T2TStateBundle::~T2TStateBundle()
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
void StateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
states = new State[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].pid = _PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
......@@ -74,26 +73,26 @@ void T2TStateBundle::MakeStates(int num)
}
/* constructor */
T2TPredictor::T2TPredictor()
Predictor::Predictor()
{
startSymbol = 2;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
Predictor::~Predictor()
{
}
/*
create an initial state
>> model - the t2t model
>> model - the model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
int beamSize, T2TStateBundle* state)
void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
int beamSize, StateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
......@@ -114,20 +113,20 @@ void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
void Predictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> model - the model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
void Predictor::Read(Model* model, StateBundle* state)
{
m = model;
s = state;
......@@ -147,9 +146,9 @@ predict the next state
>> needReorder - whether we need reordering the states
>> nstep - current time step of the target sequence
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
{
int dims[MAX_TENSOR_DIM_NUM];
......@@ -221,14 +220,14 @@ void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& e
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
XTensor Predictor::GeneratePaths(StateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
......@@ -245,7 +244,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
......@@ -263,21 +262,21 @@ get the predictions of the previous step
>> state - state bundle of the current step
>> devID - the device id for the predictions
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
IntList last;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
State* cur = state->states + i;
last.Add(cur->prediction);
}
XTensor lastPred;
InitTensor2D(&lastPred, last.Size(), 1, X_INT, devID);
lastPred.SetData(last.items, last.Size());
InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
lastPred.SetData(last.items, int(last.Size()));
return lastPred;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -21,22 +20,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#ifndef __PREDICTOR_H__
#define __PREDICTOR_H__
#include "../T2TModel.h"
#include "T2TLengthPenalty.h"
#include "../Model.h"
#include "LengthPenalty.h"
using namespace std;
namespace transformer
namespace nmt
{
#define T2T_PID_EMPTY -1
#define _PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
and etc. It can be regarded as a hypotheses in translation. */
class T2TState
class State
{
public:
/* we assume that the prediction is an integer */
......@@ -69,11 +68,11 @@ public:
int nstep;
/* pointer to the previous state */
T2TState* last;
State* last;
};
/* a bundle of states */
class T2TStateBundle
class StateBundle
{
public:
/* predictions */
......@@ -98,7 +97,7 @@ public:
float nstep;
/* list of states */
T2TState* states;
State* states;
/* number of states */
int stateNum;
......@@ -108,10 +107,10 @@ public:
public:
/* constructor */
T2TStateBundle();
StateBundle();
/* de-constructor */
~T2TStateBundle();
~StateBundle();
/* create states */
void MakeStates(int num);
......@@ -122,14 +121,14 @@ public:
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings and etc.). */
class T2TPredictor
class Predictor
{
private:
/* pointer to the transformer model */
T2TModel* m;
Model* m;
/* current state */
T2TStateBundle* s;
StateBundle* s;
/* start symbol */
int startSymbol;
......@@ -139,30 +138,30 @@ private:
public:
/* constructor */
T2TPredictor();
Predictor();
/* de-constructor */
~T2TPredictor();
~Predictor();
/* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel* model, T2TStateBundle* state);
void Read(Model* model, StateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding,
void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state);
XTensor GeneratePaths(StateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state, int devID);
XTensor GetLastPrediction(StateBundle* state, int devID);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "Search.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* constructor */
BeamSearch::BeamSearch()
......@@ -55,7 +54,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void BeamSearch::Init(T2TConfig& config)
void BeamSearch::Init(Config& config)
{
beamSize = config.beamSize;
batchSize = config.sBatchSize;
......@@ -105,10 +104,10 @@ search for the most promising states
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
void BeamSearch::Search(Model* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score)
{
T2TPredictor predictor;
Predictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
......@@ -140,10 +139,10 @@ void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
T2TStateBundle* states = new T2TStateBundle[lengthLimit + 1];
T2TStateBundle* first = states;
T2TStateBundle* cur = NULL;
T2TStateBundle* next = NULL;
StateBundle* states = new StateBundle[lengthLimit + 1];
StateBundle* first = states;
StateBundle* cur = NULL;
StateBundle* next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first);
......@@ -213,7 +212,7 @@ compute the model score for each hypotheses
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
void BeamSearch::Score(StateBundle* prev, StateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
......@@ -244,7 +243,7 @@ void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */
score = probPath / lp;
......@@ -279,7 +278,7 @@ generate tokens for the next state via beam pruning
>> prev - the last beam
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
......@@ -323,7 +322,7 @@ void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -375,26 +374,26 @@ expand the search graph
>> beam - the beam that keeps a number of states
>> reorderState - the new order of states
*/
void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState)
void BeamSearch::Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states;
State* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor reorderStateCPU;
InitTensorOnCPU(&id, &idRef);
......@@ -424,7 +423,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState& state = states[k];
State& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
......@@ -432,7 +431,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
if (offset != j)
needReorder = true;
T2TState* last = prev->states + pid * beamSize + offset;
State* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
......@@ -482,12 +481,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Collect(T2TStateBundle* beam)
void BeamSearch::Collect(StateBundle* beam)
{
T2TState* states = beam->states;
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
State& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
......@@ -508,13 +507,13 @@ void BeamSearch::Collect(T2TStateBundle* beam)
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void BeamSearch::FillHeap(T2TStateBundle* beam)
void BeamSearch::FillHeap(StateBundle* beam)
{
T2TState* states = beam->states;
State* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) {
T2TState& state = states[i * beamSize + j];
State& state = states[i * beamSize + j];
/* we push the incomplete hypothesis into the heap */
if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) {
......@@ -548,10 +547,10 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
int c = heap.Count();
float bestScore = -1e9F;
T2TState* state = NULL;
State* state = NULL;
for (int i = 0; i < c; i++) {
auto node = heap.Pop();
T2TState* s = (T2TState*)node.index;
State* s = (State*)node.index;
if (i == 0 || bestScore < node.value) {
state = s;
bestScore = node.value;
......@@ -619,12 +618,12 @@ void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool BeamSearch::IsAllCompleted(T2TStateBundle* beam)
bool BeamSearch::IsAllCompleted(StateBundle* beam)
{
T2TState* states = beam->states;
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
State& state = states[i];
if (!state.isCompleted)
return false;
}
......@@ -640,11 +639,11 @@ update the beam by removing finished hypotheses
>> alivePadding - new paddings for the inputs, (B, L)
<< aliveIdx - the indices of alive states
*/
void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
void BeamSearch::RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState)
{
T2TState* states = beam->states;
State* states = beam->states;
/* get the indices of uncompleted sentences and states */
aliveSentList.Clear();
......@@ -674,12 +673,12 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
}
}
InitTensor1D(&aliveState, aliveStateList.Size(), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, aliveStateList.Size());
InitTensor1D(&aliveState, int(aliveStateList.Size()), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, int(aliveStateList.Size()));
XTensor aliveSent;
InitTensor1D(&aliveSent, aliveSentList.Size(), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, aliveSentList.Size());
InitTensor1D(&aliveSent, int(aliveSentList.Size()), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, int(aliveSentList.Size()));
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState);
......@@ -697,7 +696,7 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor BeamSearch::MakeFirstMask(T2TStateBundle* beam)
XTensor BeamSearch::MakeFirstMask(StateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
......@@ -742,7 +741,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void GreedySearch::Init(T2TConfig& config)
void GreedySearch::Init(Config& config)
{
batchSize = config.wBatchSize;
endSymbols[0] = config.endID;
......@@ -798,7 +797,7 @@ search for the most promising states
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void GreedySearch::Search(T2TModel* model, XTensor& input,
void GreedySearch::Search(Model* model, XTensor& input,
XTensor& padding, IntList* output)
{
XTensor maskEnc;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#ifndef __SEARCH_H__
#define __SEARCH_H__
#include "../T2TModel.h"
#include "T2TPredictor.h"
#include "../Model.h"
#include "Predictor.h"
using namespace std;
namespace transformer
namespace nmt
{
/* The class organizes the search process. It calls "predictors" to generate
......@@ -42,7 +41,7 @@ private:
float alpha;
/* predictor */
T2TPredictor predictor;
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
......@@ -88,28 +87,28 @@ public:
~BeamSearch();
/* initialize the model */
void Init(T2TConfig& config);
void Init(Config& config);
/* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypotheses */
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
void Score(StateBundle* prev, StateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle* prev, T2TStateBundle* beam);
void Generate(StateBundle* prev, StateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState);
void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam);
void Collect(StateBundle* beam);
/* fill the hypotheses heap with incomplete hypotheses */
void FillHeap(T2TStateBundle* beam);
void FillHeap(StateBundle* beam);
/* save the output sequences and score */
void Dump(IntList* output, XTensor* score);
......@@ -118,17 +117,17 @@ public:
bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam);
bool IsAllCompleted(StateBundle* beam);
/* update the beam by pruning finished states */
void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam);
XTensor MakeFirstMask(StateBundle* beam);
};
class GreedySearch
......@@ -136,7 +135,7 @@ class GreedySearch
private:
/* predictor */
T2TPredictor predictor;
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
......@@ -164,10 +163,10 @@ public:
~GreedySearch();
/* initialize the model */
void Init(T2TConfig& config);
void Init(Config& config);
/* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output);
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */
void Prepare(int myBatchSize);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,27 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cmath>
#include "T2TTranslator.h"
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "Search.h"
#include "Translator.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* constructor */
T2TTranslator::T2TTranslator()
Translator::Translator()
{
}
/* de-constructor */
T2TTranslator::~T2TTranslator()
Translator::~Translator()
{
if (beamSize > 1)
delete (BeamSearch*)seacher;
......@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator()
}
/* initialize the model */
void T2TTranslator::Init(T2TConfig& config)
void Translator::Init(Config& config)
{
beamSize = config.beamSize;
vSize = config.srcVocabSize;
......@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config)
wordBatch = config.wBatchSize;
if (beamSize > 1) {
XPRINT1(0, stderr, "Translating with beam search (%d)\n", beamSize);
LOG("translating with beam search (%d)", beamSize);
seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config);
}
else if (beamSize == 1) {
XPRINT1(0, stderr, "Translating with greedy search (%d)\n", beamSize);
LOG("translating with greedy search");
seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config);
}
else {
CheckNTErrors(false, "invalid beam size\n");
CheckNTErrors(false, "Invalid beam size\n");
}
}
......@@ -80,8 +77,8 @@ test the model
>> ofn - output data file
>> model - pretrained model
*/
void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
const char* ofn, T2TModel* model)
void Translator::Translate(const char* ifn, const char* sfn,
const char* tfn, const char* ofn, Model* model)
{
int wc = 0;
int wordCountTotal = 0;
......@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn);
XPRINT1(0, stderr, "[INFO] loaded the input file, elapsed=%.1fs \n",
GetClockSec() - startT);
LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);
int count = 0;
double batchStart = GetClockSec();
......@@ -130,24 +126,24 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result;
res->id = indices[i];
res->id = int(indices[i]);
res->res = output[i];
batchLoader.outputBuffer.Add(res);
}
delete[] output;
wc += indices[-1];
wordCountTotal += indices[-1];
wc += int(indices[-1]);
wordCountTotal += int(indices[-1]);
sentCount += (indices.Size() - 1);
sentCount += int(indices.Size() - 1);
batchCount += 1;
if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec();
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%f, sword=%.1fw/s\n",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
wc = 0;
}
}
......@@ -169,8 +165,8 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
double elapsed = GetClockSec() - startDump;
XPRINT2(0, stderr, "[INFO] translation completed (word=%d, sent=%ld)\n",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
}
/*
......@@ -178,7 +174,7 @@ dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTranslator::Dump(FILE* file, XTensor* output)
void Translator::Dump(FILE* file, XTensor* output)
{
if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1];
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -21,17 +20,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#ifndef __TESTER_H__
#define __TESTER_H__
#include "T2TSearch.h"
#include "T2TDataSet.h"
#include "Search.h"
#include "DataSet.h"
namespace transformer
namespace nmt
{
/* This class translates test sentences with a trained model. */
class T2TTranslator
class Translator
{
public:
/* vocabulary size of the source side */
......@@ -57,17 +56,17 @@ public:
public:
/* constructor */
T2TTranslator();
Translator();
/* de-constructor */
~T2TTranslator();
~Translator();
/* initialize the model */
void Init(T2TConfig& config);
void Init(Config& config);
/* test the model */
void Translate(const char* ifn, const char* vfn, const char* ofn,
const char* tfn, T2TModel* model);
const char* tfn, Model* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -21,8 +20,8 @@
#include <fstream>
#include "T2TVocab.h"
#include "../module/T2TUtility.h"
#include "Vocab.h"
#include "../Utility.h"
namespace nts {
......@@ -31,7 +30,7 @@ void Vocab::Load(const string& src)
{
string vsz, sid;
ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "Unable to open the vocabulary file");
CheckNTErrors(f.is_open(), "unable to open the vocabulary file");
/* get the vocab size and the start id */
f >> vsz >> sid;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,8 +18,8 @@
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#ifndef __T2TVOCAB_H__
#define __T2TVOCAB_H__
#ifndef __VOCAB_H__
#define __VOCAB_H__
#include <cstdio>
#include <unordered_map>
......@@ -30,10 +29,10 @@ using namespace std;
namespace nts {
/* user-defined symbols */
#define UNK 0
#define PAD 1
#define SOS 2
#define EOS 2
#define UNK 3
/* the vocabulary class */
struct Vocab
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论