Commit 1d17c439 by hello

Sync with github

parent 010f385d
差异被折叠。 点击展开。
This source diff could not be displayed because it is too large. You can view the blob instead.
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,19 +19,18 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
#include "./tensor/function/FHeader.h"
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
......@@ -39,27 +38,19 @@ using namespace transformer;
int main( int argc, const char ** argv )
{
/*_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
_CrtSetBreakAlloc(2708);*/
TransformerMain(argc - 1, argv + 1);
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
//XTensor singleScore, singleIdx, score;
//InitTensor3DV2(&score, 2, 1, 136160);
////score.SetDataRand(0, 1);
//InitTensor1DV2(&singleIdx, 1, X_INT);
//singleIdx.Set1DInt(1, 0);
//singleIdx.Dump(stderr);
//singleScore = Select(score, singleIdx, 0);
//XTensor s, i;
//InitTensor3DV2(&s, 2, 1, 4);
//InitTensor3DV2(&i, 2, 1, 4, X_INT);
//TopK(score, s, i, -1, 4);
//i.Dump(stderr, "single score:\n");
//_CrtDumpMemoryLeaks();
return 0;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -31,37 +31,65 @@ namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(node->grad != NULL, "No gradient found!");
CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");
XTensor * input = income.tails[0];
XTensor * output = node;
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
XTensor * dedx = input->grad;
XTensor * dedy = output->grad;
XTensor* tmp;
/* store the result to a temporary node if the input has multiple children */
if (input->outgo.tailNum > 1) {
tmp = NewTensor(output);
tmp->SetZeroAll();
}
/* otherwise, the result is directly stored into the input node */
else {
tmp = dedx;
}
if (operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, dedy, tmp);
else if (operID == FUNC_IDENTITY)
_IdentityBackward(output, input, dedy, tmp);
else if (operID == FUNC_LOGSOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else if (operID == FUNC_RECTIFY)
_RectifyBackward(output, input, dedy, tmp);
else if (operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, dedy, tmp);
else if (operID == FUNC_SOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else {
ShowNTErrors("Unsupported backward computation! TODO!");
}
if (input->outgo.tailNum > 1) {
_SumMe(dedx, tmp);
DelTensor(tmp);
}
}
node->visitMark = NODE_FINISHED;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -33,7 +33,6 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
......@@ -48,33 +47,45 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
bool isRoot = XNoder::IsRoot(node);
if (!isEfficient || output->isGrad) {
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
dedy->SetDataFixed(1);
return;
}
gold = income.tails[1];
XTensor* tmp;
if (!isRoot) {
tmp = NewTensor(output);
tmp->SetZeroAll();
}
else{
tmp = dedy;
}
if (operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
if (isRoot)
gold->DestroyData();
else
_SumMe(dedy, tmp);
}
else {
ShowNTErrors("Unsupported backward computation! TODO!");
}
if (!isRoot)
DelTensor(tmp);
}
node->visitMark = NODE_FINISHED;
......@@ -87,79 +98,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when call the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -126,6 +126,18 @@ private:
static
void GradPower(XTensor * node, bool isEfficient);
/* gradient for power */
static
void GradReciprocal(XTensor* node, bool isEfficient);
/* gradient for sqrt */
static
void GradSqrt(XTensor* node, bool isEfficient);
/* gradient for square */
static
void GradSquare(XTensor* node, bool isEfficient);
/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
......@@ -188,6 +200,10 @@ private:
/* gradient for operation */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
/* gradient for MLP */
static
void GradMLP(XTensor* node, bool isEfficient);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficent);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,55 +42,59 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId, bool isEfficent);
void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private:
/* gradient computation for convertdatatype: b = convertdatatype(a) */
static
void GradConvertDataType(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficent);
void GradCopyIndexed(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = gather(a, index) */
static
void GradGather(XTensor * node, bool isEfficent);
void GradGather(XTensor * node, bool isEfficient);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
void GradDropoutWithIndex(XTensor * node, bool isEfficient);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
void GradMerge(XTensor * node, bool isEfficient);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node, bool isEfficent);
void GradMergeList(XTensor * node, bool isEfficient);
/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficent);
void GradTranspose(XTensor * node, bool isEfficient);
/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficent);
void GradReshape(XTensor * node, bool isEfficient);
/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node, bool isEfficent);
void GradSplit(XTensor * node, bool isEfficient);
/* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node, bool isEfficent);
void GradSplitList(XTensor * node, bool isEfficient);
/* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of spliting have been processed. We do this in a post-processing
manner because we can fuze multiple memory copy jobs one time. This is good for system speed up. */
static
void GradSplitListPost(XTensor * node, bool isEfficent);
void GradSplitListPost(XTensor * node, bool isEfficient);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node, bool isEfficent);
void GradUnsqueeze(XTensor * node, bool isEfficient);
};
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -121,8 +121,13 @@ void XNet::Backward(TensorList &roots)
ClearGrad(parent);
}
if(XNoder::IsLeaf(node))
if (XNoder::IsLeaf(node)) {
ClearGrad(node);
if (node->outgo.tailNum == 0) {
delete node;
}
}
}
}
}
......@@ -316,7 +321,6 @@ void XNet::ClearGrad(XTensor * node)
}
if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
}
......@@ -334,7 +338,7 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
Traverse(roots);
XLink::ShowNode(file, node);
//XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -128,8 +128,10 @@ int FNNLMMain(int argc, const char ** argv)
Init(model);
/* learn model parameters */
if(strcmp(trainFN, ""))
if(strcmp(trainFN, "")) {
ENABLE_GRAD;
Train(trainFN, shuffled, model);
}
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,12 +17,15 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <math.h>
#include <cmath>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -34,6 +37,7 @@ AttDecoder::AttDecoder()
selfAtt = NULL;
fnns = NULL;
selfAttLayerNorms = NULL;
fnnLayerNorms = NULL;
enDeAtt = NULL;
enDeAttLayerNorms = NULL;
decoderLayerNorm = NULL;
......@@ -49,123 +53,143 @@ AttDecoder::~AttDecoder()
delete[] selfAtt;
delete[] fnns;
delete[] selfAttLayerNorms;
delete[] fnnLayerNorms;
delete[] enDeAtt;
delete[] enDeAttLayerNorms;
delete decoderLayerNorm;
if (preNorm)
delete decoderLayerNorm;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
/*
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID)
void AttDecoder::InitModel(T2TConfig& config)
{
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 4);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
devID = config.devID;
nlayer = config.nDecLayer;
hSize = config.modelSize;
eSize = config.embSize;
vSize = config.tgtVocabSize;
dropoutP = config.dropout;
preNorm = config.preNorm;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, false);
embedder.InitModel(config, false);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
decoderLayerNorm = new T2TLN;
fnnLayerNorms = new T2TLN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
selfAttLayerNorms[i].InitModel(argc, argv, myDevID);
enDeAtt[i].InitModel(argc, argv, true, myIgnored, myDevID);
enDeAttLayerNorms[i].InitModel(argc, argv, myDevID);
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
selfAttLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
}
decoderLayerNorm->InitModel(argc, argv, myDevID);
if (preNorm)
decoderLayerNorm->InitModel(config);
}
/*
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining)
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, inputDec.GetDim(1), true);
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor inputNorm;
XTensor attNorm;
XTensor res;
XTensor selfAttnBefore;
XTensor selfAttnAfter;
XTensor endeAttnBefore;
XTensor endeAttnAfter;
XTensor fnnBefore;
/* layer normalization */
inputNorm = selfAttLayerNorms[i].Make(x);
/* layer normalization with pre-norm for self-attn */
selfAttnBefore = LayerNorm(x, selfAttLayerNorms[i], preNorm, true, false);
/******************/
/* self attention */
att = selfAtt[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfAttCache[i], SELF_ATT);
att = selfAtt[i].Make(selfAttnBefore, selfAttnBefore, selfAttnBefore,
mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
att = att + x;
res = Sum(att, x);
/* layer normalization */
attNorm = enDeAttLayerNorms[i].Make(att);
/* layer normalization with post-norm for self-attention */
selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for encoder-decoder attention */
endeAttnBefore = LayerNorm(selfAttnAfter, enDeAttLayerNorms[i], preNorm, true, false);
/* encoder-decoder attention */
ende = enDeAtt[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &enDeAttCache[i], EN_DE_ATT);
ende = enDeAtt[i].Make(outputEnc, endeAttnBefore, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
ende = ende + att;
res = Sum(ende, selfAttnAfter);
/* layer normalization with post-norm for encoder-decoder attention */
endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(endeAttnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
x = fnns[i].Make(ende, isTraining);
fnn = fnns[i].Make(fnnBefore, isTraining);
}
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
x = decoderLayerNorm->Make(x);
/* residual connection */
res = Sum(fnn, endeAttnAfter);
return x;
}
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = decoderLayerNorm->Make(x);
return x;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,18 +17,17 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
namespace transformer
{
#define DECODING_NAME "decoding"
#define DECODING_INPUT_NAME "decoding_input"
class AttDecoder
{
......@@ -52,36 +51,29 @@ public:
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
T2TFNN* fnns;
/* attention model of each layer */
T2TAttention * selfAtt;
T2TAttention* selfAtt;
/* layer normalization for attention */
T2TLN * selfAttLayerNorms;
/* layer normalization for decoder */
T2TLN * decoderLayerNorm;
T2TLN* selfAttLayerNorms;
/* input tensor of the encoder */
XTensor * input;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
/* output tensor of the encoder */
XTensor * output;
/* layer normalization for decoder */
T2TLN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
T2TAttention * enDeAtt;
T2TAttention* enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN * enDeAttLayerNorms;
T2TLN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
......@@ -89,20 +81,22 @@ public:
/* layer cache list */
Cache* enDeAttCache;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttDecoder();
/* deconstructor */
/* de-constructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
void InitModel(T2TConfig& config);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,12 +17,15 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <math.h>
#include <cmath>
#include "T2TEncoder.h"
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -31,63 +34,65 @@ namespace transformer
/* constructor */
AttEncoder::AttEncoder()
{
attentions = NULL;
selfAtt = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
encoderLayerNorm = NULL;
}
/* de-constructor */
AttEncoder::~AttEncoder()
{
delete[] attentions;
delete[] selfAtt;
delete[] fnns;
delete[] attLayerNorms;
delete encoderLayerNorm;
delete[] fnnLayerNorms;
if (preNorm)
delete encoderLayerNorm;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
/*
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID)
void AttEncoder::InitModel(T2TConfig& config)
{
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 20);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
devID = config.devID;
nlayer = config.nEncLayer;
eSize = config.embSize;
hSize = config.modelSize;
vSize = config.srcVocabSize;
preNorm = config.preNorm;
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(argc, argv, devID);
embedder.InitModel(config);
attentions = new T2TAttention[nlayer];
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
encoderLayerNorm = new T2TLN;
fnnLayerNorms = new T2TLN[nlayer];
if (preNorm)
encoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
attLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
}
encoderLayerNorm->InitModel(argc, argv, myDevID);
if (preNorm)
encoderLayerNorm->InitModel(config);
}
/*
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
......@@ -95,53 +100,74 @@ make the encoding network
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, bool isTraining)
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, 0);
x = embedder.Make(input, false, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor ln;
XTensor fnn;
XTensor res;
XTensor inputNorm;
XTensor attnBefore;
XTensor attnAfter;
XTensor fnnBefore;
/* layer normalization */
inputNorm = attLayerNorms[i].Make(x);
/* layer normalization with pre-norm for self-attn */
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = attentions[i].Make(inputNorm, inputNorm, inputNorm, mask, isTraining, NULL, 0);
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
/* dropout */
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization with post-norm for self-attn */
attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(attnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
x = fnns[i].Make(res, isTraining);
}
fnn = fnns[i].Make(fnnBefore, isTraining);
x = encoderLayerNorm->Make(x);
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, attnAfter);
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (wrapper)
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor *mask, bool isTraining)
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,47 +17,35 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TENCODER_H__
#define __T2TENCODER_H__
#include "T2TFNN.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "module/T2TFNN.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "module/T2TEmbedding.h"
#include "module/T2TLayerNormal.h"
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define ENCODING_NAME "encoding"
#define ENCODING_INPUT_NAME "encoding_input"
/*
base class of the encoder
/*
base class of the encoder
*/
class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor *mask, XTensor &mask2, bool isTraining) = 0;
};
/*
the encoder based on RNN
*/
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input, XTensor *mask, XTensor &mask2, bool isTraining);
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
};
/*
the encoder based on self-attention
/*
the encoder based on self-attention
*/
class AttEncoder : T2TEncoder
{
......@@ -88,23 +76,23 @@ public:
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
T2TFNN* fnns;
/* attention model of each layer */
T2TAttention * attentions;
T2TAttention* selfAtt;
/* layer normalizations for attention */
T2TLN * attLayerNorms;
T2TLN* attLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
/* layer normalization for encoder */
T2TLN * encoderLayerNorm;
T2TLN* encoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
/* the location of layer normalization */
bool preNorm;
/* output tensor of the encoder */
XTensor * output;
public:
/* constructor */
AttEncoder();
......@@ -113,18 +101,15 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
void InitModel(T2TConfig& config);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor *mask, bool isTraining);
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,16 +17,18 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TMODEL_H__
#define __T2TMODEL_H__
#include "T2TFNN.h"
#include "T2TAttention.h"
#include "T2TEncoder.h"
#include "T2TDecoder.h"
#include "T2TOutput.h"
#include "module/T2TFNN.h"
#include "module/T2TOutput.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
namespace transformer
{
......@@ -41,13 +43,13 @@ public:
int devID;
/* the encoder */
AttEncoder * encoder;
AttEncoder* encoder;
/* the decoder */
AttDecoder * decoder;
AttDecoder* decoder;
/* output layer */
T2TOutput * outputLayer;
T2TOutput* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -55,9 +57,18 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* number of heads in the attention model */
int nhead;
/* indicates whether share encoders embeddings with decoders */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings with output weights */
int shareDecInputOutputWeight;
public:
/* constructor */
T2TModel();
......@@ -66,42 +77,42 @@ public:
~T2TModel();
/* initialize the model */
void InitModel(int argc, char ** argv);
void InitModel(T2TConfig& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor *mask, bool isTraining);
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
/* make the encoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor *mask, XTensor &MaskEncDec, bool isTraining);
XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
/* make the network for langauge modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrics */
void GetParams(TensorList &list);
/* get parameter matrices */
void GetParams(TensorList& list);
/* dump the parameters */
void Dump(const char * fn);
/* dump the model to a file */
void Dump(const char* fn);
/* read the parameters */
void Read(const char * fn);
void Read(FILE* file);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,75 +17,55 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <math.h>
#include <time.h>
#include <cmath>
#include <ctime>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "train/T2TTrainer.h"
#include "module/T2TUtility.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
namespace transformer
{
int TransformerMain(int argc, const char ** argv)
int TransformerMain(int argc, const char** argv)
{
if(argc == 0)
if (argc == 0)
return 1;
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
ShowParams(argc, args);
bool isBeamSearch = false;
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
char * rawModel = new char[MAX_LINE_LENGTH];
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "rawmodel", rawModel, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
/* load configurations */
T2TConfig config(argc, argv);
srand((unsigned int)time(NULL));
T2TModel model;
model.InitModel(argc, args);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, "")){
T2TTester searcher;
searcher.Init(argc, args);
searcher.Test(testFN, outputFN, &model);
/* train the model */
if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model;
model.InitModel(config);
T2TTrainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
delete[] rawModel;
for(int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
/* translate the test file */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
DISABLE_GRAD;
T2TModel model;
model.InitModel(config);
T2TTranslator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
}
return 0;
}
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,13 +17,13 @@
/*
*
* An impelementation of the transformer system. See more details
* about FNNLM in
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* I start writing the code related to NMT - a long time since my last coding
* work on MT
*/
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#include "T2TNNUtil.h"
#include "T2TUtility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
    /* cache for keys, (B, L, H) */
    XTensor key;

    /* cache for values, (B, L, H) */
    XTensor value;

public:
    /* indicates cache miss if 'true' */
    bool miss;

    /* constructor */
    Cache();

    /* update the states cache with new keys and values */
    void Update(XTensor&& k, XTensor&& v);

    /* keep only the states selected by the given indices of alive hypotheses */
    void KeepAlive(XTensor& aliveIdx);

    /* reorder the cached states (e.g., after beam reordering) */
    void Reorder(XTensor& reorder);
};
/* multi-head attention */
class T2TAttention
{
public:
    /* device id */
    int devID;

    /* head number */
    int nhead;

    /* transformation matrix for Q */
    XTensor wq;

    /* bias for Q */
    XTensor bq;

    /* transformation matrix for K */
    XTensor wk;

    /* bias for K */
    XTensor bk;

    /* transformation matrix for V */
    XTensor wv;

    /* bias for V */
    XTensor bv;

    /* fused transformation matrix, presumably producing Q, K and V in a
       single product — TODO(review): confirm against InitModel/Make */
    XTensor wBig;

    /* bias of the fused transformation */
    XTensor bBig;

    /* relative position representation (RPR) embeddings for keys */
    XTensor RPEmbK;

    /* transformation after dot-product attention */
    XTensor wo;

    /* bias after dot-product attention */
    XTensor bo;

    /* size of transformed Q and K */
    int dk;

    /* size of transformed V */
    int dv;

    /* size of input Q, K and V */
    int d;

    /* indicates whether we use the RPR attention */
    bool useRPR;

    /* dropout probability */
    DTYPE dropoutP;

    /* the maximum relative window size */
    int maxRP;

public:
    /* constructor */
    T2TAttention();

    /* de-constructor */
    ~T2TAttention();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& k, XTensor& q, XTensor& v,
                 XTensor* mask, bool isTraining,
                 Cache* cache, int cacheType);

    /* make the attention network given keys, queries and values (after linear transformation) */
    XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
                          XTensor* mask, bool isTraining);

    /* make the attention network with relative position representations
       given keys, queries and values (after linear transformation) */
    XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
                             XTensor* mask, bool isTraining, bool isEnc);

    /* build the relative position embedding table for a (query, key) length pair */
    XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);

    /* dot-product that incorporates the relative position representations */
    XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
* This file includes some common modules of the Transformer model
*/
#include <cmath>
#include "T2TCommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/*
flexible layer normalization for the Transformer
>> input - input tensor
>> ln - the layernorm network
>> prenorm - whether we use prenorm or not
>> before - whether we use layernorm before attention/fnn (unused in the
   decision below; the choice depends only on 'after' and 'prenorm')
>> after - whether we use layernorm after attention/fnn
<< return - the normalized tensor, or the input unchanged
*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
{
    /* pre-norm normalizes before the sublayer, post-norm after it,
       so the norm is applied exactly when 'after' differs from 'prenorm' */
    const bool applyNorm = (after != prenorm);

    return applyNorm ? ln.Make(input) : input;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "T2TLayerNormal.h"
#include "T2TCommonModules.h"
using namespace nts;
namespace transformer
{
/* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
    devID = -1;
    vSize = -1;
    maxLength = -1;
    /* also give the remaining members defined values so the object is
       never read in a partially-undefined state before InitModel() runs */
    eSize = -1;
    d = -1;
    padIdx = -1;
}
/* de-constructor (nothing to release: all members clean up themselves) */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
{
    devID = config.devID;
    d = config.modelSize;
    padIdx = config.padID;
    eSize = config.embSize;
    maxLength = config.maxPosLen;

    /* the encoder and the decoder may use different vocabularies */
    vSize = (isEnc) ? config.srcVocabSize : config.tgtVocabSize;

    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);

    /* +2 presumably reserves extra positions (padding and start symbol) —
       TODO(review): confirm against Make(), which numbers real tokens
       from position 2 and padding as position 1 */
    maxLength = maxLength + 1 + 1;

    /* initialize the word embeddings from a normal distribution with
       standard deviation 1/sqrt(eSize) */
    DTYPE v = 1.0F / (float)sqrt((float)eSize);
    w.SetDataRandn(0, v);

    /* create the positional embedding matrix */
    MakePosEmbedding(maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int length)
{
    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

    /* value-initialize the buffer to 0 so that, when eSize is odd, the
       last channel of each row (never written by the two half-loops
       below) holds 0 instead of indeterminate garbage */
    float* data = new float[posEmbeddingBase.unitNum]();

    /* number of channels in each of the sin/cos halves; the frequency
       term uses a fairseq-style (channelSize - 1) denominator, which
       presumes channelSize >= 2 (i.e., eSize >= 4) to avoid a division
       by zero — TODO(review): confirm the configured embedding sizes */
    int channelSize = eSize / 2;

    for (int pos = 0; pos < length; pos++) {
        float* dp = data + pos * eSize;
        int offset = 0;

        /* first half of the channels: sine */
        for (int i = 0; i < channelSize; i++) {
            dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
        }
        /* second half of the channels: cosine */
        for (int i = 0; i < channelSize; i++) {
            dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
        }
    }

    /* the embedding of the padding position is all zeros */
    int padStart = padIdx * eSize;
    for (int i = padStart; i < padStart + eSize; i++)
        data[i] = 0.F;

    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);

    if (w.dataType != posEmbeddingBase.dataType)
        posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);

    delete[] data;
}
/*
make the network
>> input - the word indices
>> isDec - indicates whether it is decoder
>> isTraining - indicates whether it is training
>> nstep - the length of current sequence (current decoding step)
<< return - word & position embeddings of the input
*/
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
    /* make sure the padding index is 1 */
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

    XTensor wordEmbedding, position, posEmbedding;
    InitTensor(&position, &input);

    int* posData = new int[input.unitNum];

    /* copy the indices to CPU memory so we can walk them with raw
       pointers below, whatever device the input lives on */
    XTensor inputCPU;
    InitTensorOnCPU(&inputCPU, &input);
    _CopyValues(&input, &inputCPU);

    if (!isDec)
    {
        /* encoder embeddings: real tokens get consecutive positions
           starting from 2, while padding tokens all get position 1
           (assumes the padding token id is 1 — see the note above) */
        for (int i = 0; i < inputCPU.dimSize[0]; i++) {
            int startNoPad = 1 + 1;
            int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
            for (int j = 0; j < inputCPU.dimSize[1]; j++) {
                if (p[j] == 1) {
                    posData[i * inputCPU.dimSize[1] + j] = 1;
                }
                else {
                    posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
                }
            }
        }
        position.SetData(posData, position.unitNum);
    }
    else
    {
        /* decoder embeddings: during step-wise decoding all entries are
           at the same step, shifted by 2 to match the encoder numbering */
        position.SetDataFixed(nstep + 2);
    }

    delete[] posData;

    /* we make positional embeddings first */
    posEmbedding = Gather(posEmbeddingBase, position);

    /* then we make word embeddings, scaled by sqrt(eSize) as in
       "Attention Is All You Need" */
    wordEmbedding = Gather(w, input);
    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
{
public:
    /* device id */
    int devID;

    /* vocabulary size */
    int vSize;

    /* embedding size */
    int eSize;

    /* maximum length of the sequence (extended by 2 reserved positions
       in InitModel) */
    int maxLength;

    /* dimension size of the hidden layers in the t2t model */
    int d;

    /* padding index */
    int padIdx;

    /* word embedding matrix */
    XTensor w;

    /* predefined positional embeddings. It can speed up
       the embedding processing by re-loading. */
    XTensor posEmbeddingBase;

public:
    /* constructor */
    T2TEmbedder();

    /* de-constructor */
    ~T2TEmbedder();

    /* initialize the model */
    void InitModel(T2TConfig& config, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int length);

    /* make the network */
    XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
T2TFNN::T2TFNN()
{
    inSize = -1;
    outSize = -1;
    hSize = -1;
    /* also give the remaining members defined values so the object is
       never read in a partially-undefined state before InitModel() runs */
    devID = -1;
    dropoutP = 0.0F;
}
/* de-constructor (nothing to release: all members clean up themselves) */
T2TFNN::~T2TFNN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void T2TFNN::InitModel(T2TConfig& config)
{
    devID = config.devID;
    inSize = config.modelSize;
    outSize = config.modelSize;
    hSize = config.fnnHiddenSize;
    dropoutP = config.fnnDropout;

    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
    InitTensor1D(&b1, hSize, X_FLOAT, devID);
    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b2, outSize, X_FLOAT, devID);

    /* fan-in/fan-out (Xavier-style) initialization for the weights,
       zeros for the biases */
    float scale = 1.0F;
    _SetDataFanInOut(&w1, scale);
    _SetDataFanInOut(&w2, scale);

    b1.SetZeroAll();
    b2.SetZeroAll();
}
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> isTraining - indicates whether the network runs in training mode
<< return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
    /* hidden = max(0, input * w1 + b1) */
    XTensor hidden = Rectify(MulAndShift(input, w1, b1));

    /* dropout on the hidden layer, applied only while training */
    if (isTraining && dropoutP > 0)
        hidden = Dropout(hidden, dropoutP);

    /* hidden * w2 + b2 */
    return MulAndShift(hidden, w2, b2);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
{
public:
    /* device id */
    int devID;

    /* size of input vector */
    int inSize;

    /* size of output vector */
    int outSize;

    /* size of hidden layers */
    int hSize;

    /* matrix of transformation 1 */
    XTensor w1;

    /* bias of transformation 1 */
    XTensor b1;

    /* matrix of transformation 2 */
    XTensor w2;

    /* bias of transformation 2 */
    XTensor b2;

    /* dropout probability applied to the hidden layer in Make() */
    DTYPE dropoutP;

public:
    /* constructor */
    T2TFNN();

    /* de-constructor */
    ~T2TFNN();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& input, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
GLU::GLU()
{
    inSize = -1;
    outSize = -1;
    hSize = -1;
    /* also initialize the device id so the object is never read in a
       partially-undefined state before InitModel() runs */
    devID = -1;
}
/* de-constructor (nothing to release: all members clean up themselves) */
GLU::~GLU()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(T2TConfig& config)
{
    devID = config.devID;
    inSize = config.modelSize;
    outSize = config.modelSize;

    /* FIX: 'hSize' was never assigned here (it stayed -1 from the
       constructor) yet is used as a tensor dimension below. Each half of
       the split input in Make() carries the model size, so use that.
       TODO(review): confirm against the callers of GLU. */
    hSize = config.modelSize;

    InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b1, outSize, X_FLOAT, devID);
    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b2, outSize, X_FLOAT, devID);
}
/*
make the network
y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2),
where [x1, x2] is the input split in half along its last dimension
>> input - the input tensor, size = 2 * hSize
<< return - the output tensor, size = hSize
*/
XTensor GLU::Make(XTensor& input)
{
    TensorList halves;

    /* split the input into two tensors along the last dimension */
    Split(input, halves, -1, 2);

    /* the gate: x2 * w2 + b2 */
    XTensor gate = MulAndShift(halves.GetItem(1), w2, b2);

    /* the linear part: x1 * w1 + b1 */
    XTensor linearPart = MulAndShift(halves.GetItem(0), w1, b1);

    return linearPart * Sigmoid(gate);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __GLU_H__
#define __GLU_H__
#include "T2TLayerNormal.h"
#include "T2TGatedLinearUnit.h"
using namespace nts;
namespace transformer
{
/* a gated linear unit: y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2) */
class GLU
{
public:
    /* device id */
    int devID;

    /* size of input vector */
    int inSize;

    /* size of output vector */
    int outSize;

    /* size of hidden layers */
    int hSize;

    /* matrix of transformation 1 */
    XTensor w1;

    /* bias of transformation 1 */
    XTensor b1;

    /* matrix of transformation 2 */
    XTensor w2;

    /* bias of transformation 2 */
    XTensor b2;

public:
    /* constructor */
    GLU();

    /* de-constructor */
    ~GLU();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& input);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer
{
/* constructor */
LayerHistory::LayerHistory()
{
    d = -1;
    count = -1;
    /* NOTE(review): 'weight' is declared as an XTensor object (not a
       pointer) in the header, so assigning NULL relies on an implicit
       conversion — confirm this is intended. 'devID' and 'nlayer' stay
       uninitialized until InitModel() runs. */
    weight = NULL;
    layerNorms = NULL;
}
/* de-constructor */
LayerHistory::~LayerHistory()
{
    /* the history stores raw tensor pointers it does not own (Clear()
       does not delete the elements here) — presumably the tensors are
       owned elsewhere; confirm against TensorList */
    history.Clear();
    delete[] layerNorms;
}
/*
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(T2TConfig& config)
{
    devID = config.devID;
    d = config.modelSize;
    nlayer = config.nEncLayer;

    /* the (nlayer+1) x (nlayer+1) triangular weight matrix for the
       dynamic linear combination of layers (the extra row/column is
       presumably for the embedding output — confirm) */
    InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);

    layerNorms = new T2TLN[nlayer];

    /* initialize the layer normalization of each layer */
    for (int i = 0; i < nlayer; i++) {
        layerNorms[i].InitModel(config);
    }

    /* NOTE(review): 'count' is not reset here (nor in ClearHistory());
       it starts at -1 from the constructor, which makes the indexing in
       Add() look off by one — confirm where it is reset */
}
/*
the Add operation
>> tensor - the previous layer output. It might be of size B * L * H
            where B = batch size, L = sequence length,
            and H = vector size of each position
*/
void LayerHistory::Add(XTensor& tensor)
{
    /* the embedding is not normed */
    count += 1;
    if (history.Size() == 0) {
        /* NOTE(review): the list stores the address of the caller's
           tensor — the caller must keep it alive while it is in the
           history */
        history.Add(&tensor);
        return;
    }

    /* normalize the layer output before storing it.
       NOTE(review): 'ln' is a local, so storing its address dangles once
       this function returns unless TensorList::Add copies; and with
       'count' starting at -1 (see the constructor) the second call
       indexes layerNorms[-1]. Both points need confirming against
       TensorList and the call sites. */
    XTensor ln = layerNorms[count - 2].Make(tensor);
    history.Add(&ln);
}
/*
generate the weighted sum of all previous layer outputs in the history
as the input of the next layer
<< return - the weighted combination of the stored layer outputs
*/
XTensor LayerHistory::Pop()
{
    /* the number of layer outputs in the history */
    size_t size = history.Size();

    TensorList historyList;
    for (size_t i = 0; i < size; i++)
        historyList.Add(history[i]);

    /* we need stack the tensor along the first dim */
    XTensor stackTensor = Stack(historyList, 0);

    XTensor interWeight;
    InitTensor2D(&interWeight, 1, weight.dimSize[1], DEFAULT_DTYPE, devID);
    XTensor layerWeight;
    InitTensor1D(&layerWeight, size, DEFAULT_DTYPE, devID);

    /* presumably selects row (size - 1) of the weight matrix —
       confirm _SelectRange's (dim, low, high) semantics */
    _SelectRange(&weight, &interWeight, 0, size - 1, size);
    interWeight.Reshape(interWeight.unitNum);

    /* keep the first 'size' weights of that row */
    _SelectRange(&interWeight, &layerWeight, 0, 0, size);

    /* scale each stored layer output by its weight and sum over layers */
    MultiplyDimMe(stackTensor, layerWeight, 0);

    XTensor result;
    ReduceSum(stackTensor, result, 0);

    return result;
}
/* drop all stored layer outputs.
   NOTE(review): "count" is not reset here — verify whether callers are
   expected to reset it separately */
void LayerHistory::ClearHistory()
{
    history.Clear();
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/*
history of layer outputs for the dynamic linear combination of layers
(DLCL): the output of every previous layer is stored, and the input of
the current layer is produced as a learnable weighted sum of those
outputs. (The previous comment here described multi-head attention and
appears to have been copied from the attention module.)
*/
class LayerHistory
{
public:
    /* device id */
    int devID;

    /* the triangle weight matrix for dlcl */
    XTensor weight;

    /* hidden size */
    int d;

    /* layer number */
    int nlayer;

    /* current layer number */
    int count;

    /* a history to store the output of intermediate layers */
    TensorList history;

    /* layer normalization for each intermediate layer */
    T2TLN* layerNorms;

public:
    /* constructor */
    LayerHistory();

    /* de-constructor */
    ~LayerHistory();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* add the layer output to the history */
    void Add(XTensor& tensor);

    /* compute the layer input for the current layer: the weighted sum of
       all previous (normed) layer outputs in the history */
    XTensor Pop();

    /* clean the history */
    void ClearHistory();
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: the module is empty until InitModel() is called */
T2TLN::T2TLN()
{
    d = 0;
    devID = -1;
}
/* de-constructor: nothing to free explicitly */
T2TLN::~T2TLN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void T2TLN::InitModel(T2TConfig& config)
{
    devID = config.devID;
    d = config.modelSize;

    /* per-dimension scale (w) and bias (b) vectors */
    InitTensor1D(&w, d, X_FLOAT, devID);
    InitTensor1D(&b, d, X_FLOAT, devID);

    /* identical lower and upper bound: fill the scale vector with ones,
       the standard layer-norm initialization */
    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();
}
/*
make the network: y = (x - mean) / standard-deviation * w + b
>> input - the input tensor
<< return - layer normalization output
*/
XTensor T2TLN::Make(XTensor& input)
{
    /* NOTE: "x" aliases "input", so the FP16->FP32 conversion below
       overwrites the caller's tensor; it is converted back before the
       function returns */
    XTensor& x = input;
    XTensor xn;
    XTensor mean;
    XTensor variance;
    XTensor standard;
    XTensor meanFilled;
    XTensor standardFilled;

    TENSOR_DATA_TYPE dataType = input.dataType;

    if (dataType == X_FLOAT16) {
        /* reduce functions can only run with FP32 */
        x = ConvertDataType(input, X_FLOAT);
    }

    /* \mu = (sum_i x_i)/m */
    mean = ReduceMean(x, x.order - 1);

    /* \sigma = (sum_i (x_i - \mu)^2)/m */
    variance = ReduceVariance(x, x.order - 1, mean);

    /* standard = sqrt(variance) */
    standard = Power(variance, 0.5F);

    /* unsqueeze mean and standard deviation to fit them into
       the same shape of x */
    meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
    standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));

    /* x' = (x - \mu)/standard */
    xn = (x - meanFilled) / standardFilled;

    /* restore the original data type if a conversion happened above */
    if (dataType != mean.dataType) {
        x = ConvertDataType(x, dataType);
        xn = ConvertDataType(xn, dataType);
    }

    /* result = x' * w + b */
    return xn * w + b;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
   where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
    /* device id */
    int devID;

    /* the scale parameter w (a vector of model-dimension size) */
    XTensor w;

    /* the bias term b */
    XTensor b;

    /* dimension size of the model */
    int d;

public:
    /* constructor */
    T2TLN();

    /* de-constructor */
    ~T2TLN();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& input);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "T2TNNUtil.h"
namespace transformer
{
/*
a wrapper for the gather function: 2d tensors are gathered directly,
3d tensors are temporarily viewed as 2d, gathered, and reshaped back
>> src - the input tensor (2d or 3d; reshaped in place and restored)
>> index - the index tensor
<< res - the output tensor
*/
XTensor AutoGather(XTensor& src, XTensor& index)
{
    if (src.order == 2)
        return Gather(src, index);
    else {
        CheckNTErrors(src.order == 3, "the source must be 3d");
        int order = src.order;
        int dimSize[MAX_TENSOR_DIM_NUM];
        /* remember the original shape so it can be restored below */
        for (int i = 0; i < src.order; i++) {
            dimSize[i] = src.dimSize[i];
        }

        /* fold the last two dimensions so Gather can operate on rows */
        src.Reshape(src.dimSize[0], src.dimSize[1] * src.dimSize[2]);
        XTensor res = Gather(src, index);
        src.Reshape(order, dimSize);

        /* result shape: first dim comes from the index, last dim is the
           original trailing dim, the middle one is inferred from the
           element count */
        dimSize[0] = index.dimSize[0];
        dimSize[1] = res.unitNum / (dimSize[0] * dimSize[2]);
        res.Reshape(order, dimSize);
        return res;
    }
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __T2TNNUTIL_H__
#define __T2TNNUTIL_H__
#include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* the gather function for tensor with any dimension */
XTensor AutoGather(XTensor& src, XTensor& index);
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: sizes stay at -1 until InitModel() fills them in */
T2TOutput::T2TOutput()
{
    hSize = -1;
    vSize = -1;
    devID = -1;
}
/* de-constructor: nothing to free explicitly */
T2TOutput::~T2TOutput()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void T2TOutput::InitModel(T2TConfig& config)
{
    devID = config.devID;
    hSize = config.modelSize;
    vSize = config.tgtVocabSize;

    /* the (vocab x hidden) projection matrix */
    InitTensor2D(&w, vSize, hSize, X_FLOAT, devID);

    /* random normal init with spread 1/sqrt(hSize)
       (second argument of SetDataRandn — presumably the standard
       deviation; verify against the XTensor API) */
    DTYPE v = 1.0F / (float)sqrt((float)hSize);
    w.SetDataRandn(0, v);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
>> isTraining - whether it is used for training (applies softmax)
>> normalized - whether to normalize the output with log-softmax
                (used for beam search; ignored when training)
*/
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
    XTensor& x = input;

    /* project hidden states onto the vocabulary; w is (vSize, hSize),
       hence the transpose */
    output = MMul(x, X_NOTRANS, w, X_TRANS);

    /* use softmax for training */
    if (isTraining) {
        output = Softmax(output, -1);
        return;
    }

    /* normalize the output for beam search */
    if (normalized) {
        auto dataType = output.dataType;
        /* run log-softmax in FP32 and convert back afterwards */
        if (dataType == X_FLOAT16)
            output = ConvertDataType(output, X_FLOAT);

        output = LogSoftmax(output, -1);

        if (output.dataType != dataType)
            output = ConvertDataType(output, dataType);
    }
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#include "T2TUtility.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* output layer: projects decoder hidden states onto the target vocabulary */
class T2TOutput
{
public:
    /* device id */
    int devID;

    /* vocabulary size */
    int vSize;

    /* vector size of the linear transformation */
    int hSize;

    /* transformation matrix */
    XTensor w;

public:
    /* constructor */
    T2TOutput();

    /* de-constructor */
    ~T2TOutput();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network (redefined output tensor) */
    void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <fstream>
#include <sstream>
#include "T2TUtility.h"
#include "../../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace transformer
{
/*
load configurations from the command
>> argc - number of arguments
>> argv - the list of arguments
*/
T2TConfig::T2TConfig(int argc, const char** argv)
{
    /* allocate every slot with a fixed capacity up front: options loaded
       from a config file may be more numerous (and longer) than the
       command-line ones, so reusing exact-sized copies of argv (as the
       code previously did) could write through uninitialized pointers or
       past the end of a buffer */
    char** args = new char*[MAX_PARAM_NUM];
    for (int i = 0; i < MAX_PARAM_NUM; i++) {
        args[i] = new char[1024];
        args[i][0] = '\0';
    }
    for (int i = 0; i < argc && i < MAX_PARAM_NUM; i++) {
        strncpy(args[i], argv[i], 1023);
        args[i][1023] = '\0';
    }

    char* configFN = new char[1024];
    /* the config file can only be given on the command line itself */
    LoadParamString(argc, args, "config", configFN, "");

    int argsNum = argc;

    /* load configurations from a file */
    if (strcmp(configFN, "") != 0)
        argsNum = LoadFromFile(configFN, args);

    ShowParams(argsNum, args);

    /* options for the model */
    LoadParamInt(argsNum, args, "nhead", &nhead, 8);
    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
    LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
    LoadParamInt(argsNum, args, "embsize", &embSize, 256);
    LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
    LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
    LoadParamInt(argsNum, args, "padid", &padID, 1);
    LoadParamInt(argsNum, args, "startid", &startID, 2);
    LoadParamInt(argsNum, args, "endid", &endID, 2);
    LoadParamBool(argsNum, args, "rpr", &useRPR, false);
    LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
    LoadParamString(argsNum, args, "model", modelFN, "model.bin");
    LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
    LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");

    /* options for training; note: argsNum (not argc) is used throughout so
       that options coming from the config file are honored as well — the
       previous code read many of these with argc, silently ignoring
       file-provided values */
    LoadParamString(argsNum, args, "train", trainFN, "");
    LoadParamString(argsNum, args, "valid", validFN, "");
    LoadParamInt(argsNum, args, "dev", &devID, 0);
    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
    isTraining = (strcmp(trainFN, "") == 0) ? false : true;
    LoadParamBool(argsNum, args, "mt", &isMT, true);
    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
    LoadParamFloat(argsNum, args, "lrate", &lrate, 1.0F);
    LoadParamFloat(argsNum, args, "lrbias", &lrbias, 0);
    LoadParamInt(argsNum, args, "nepoch", &nepoch, 20);
    LoadParamInt(argsNum, args, "nstep", &nstep, 100000);
    LoadParamInt(argsNum, args, "nwarmup", &nwarmup, 3000);
    LoadParamBool(argsNum, args, "adam", &useAdam, true);
    LoadParamFloat(argsNum, args, "adambeta1", &adamBeta1, 0.9F);
    LoadParamFloat(argsNum, args, "adambeta2", &adamBeta2, 0.98F);
    LoadParamFloat(argsNum, args, "adamdelta", &adamDelta, 1e-9F);
    LoadParamBool(argsNum, args, "shuffled", &isShuffled, true);
    LoadParamFloat(argsNum, args, "labelsmoothing", &labelSmoothingP, 0.1);
    LoadParamInt(argsNum, args, "nstepcheckpoint", &nStepCheckpoint, -1);
    LoadParamBool(argsNum, args, "epochcheckpoint", &useEpochCheckpoint, false);
    LoadParamInt(argsNum, args, "updatestep", &updateStep, 1);
    LoadParamBool(argsNum, args, "debug", &isDebugged, false);
    LoadParamBool(argsNum, args, "sorted", &isLenSorted, false);
    LoadParamInt(argsNum, args, "bufsize", &bufSize, 50000);
    LoadParamBool(argsNum, args, "doubledend", &isDoubledEnd, false);
    LoadParamBool(argsNum, args, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argsNum, args, "bigbatch", &isBigBatch, false);
    LoadParamBool(argsNum, args, "randbatch", &isRandomBatch, false);
    LoadParamInt(argsNum, args, "bucketsize", &bucketSize, 0);

    /* options for translating */
    LoadParamString(argsNum, args, "test", testFN, "");
    LoadParamString(argsNum, args, "output", outputFN, "");
    LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
    LoadParamBool(argsNum, args, "fp16", &useFP16, false);
    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);

    /* all MAX_PARAM_NUM slots were allocated, so all must be freed */
    for (int i = 0; i < MAX_PARAM_NUM; i++)
        delete[] args[i];
    delete[] args;
    delete[] configFN;
}
/*
load configurations from a file
>> configFN - path to the configuration file
>> args - the list to store the configurations
format: one option per line, name and value separated by whitespace
<< return - the number of argument strings written into args
*/
int T2TConfig::LoadFromFile(const char* configFN, char** args) {
    ifstream f(configFN, ios::in);
    CheckNTErrors(f.is_open(), "unable to open the config file");

    int argsNum = 0;

    /* parse arguments */
    string key, value;
    while (f >> key >> value) {
        /* prepend the dash so the option matches the "-name" form that the
           LoadParamXXX functions search for (the previous code appended it,
           producing "name-", which could never match any option) */
        key = '-' + key;
        strcpy(args[argsNum++], key.c_str());
        strcpy(args[argsNum++], value.c_str());
    }

    /* record the number of arguments */
    return argsNum;
}
/*
read a string option from an argument list
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - buffer that receives the value
>> defaultP - value copied into p when the option is absent
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        /* the option must be followed by its value */
        if (strcmp(argv[i], optName) == 0 && i + 1 < argc) {
            strcpy(p, argv[i + 1]);
            return;
        }
    }

    /* the option was not given: fall back to the default */
    strcpy(p, defaultP);
}
/*
read an integer option from an argument list
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - receives the parsed value
>> defaultP - value used when the option is absent
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        /* the option must be followed by its value */
        if (strcmp(argv[i], optName) == 0 && i + 1 < argc) {
            *p = atoi(argv[i + 1]);
            return;
        }
    }

    /* the option was not given: fall back to the default */
    *p = defaultP;
}
/*
read a boolean option from an argument list; a boolean option is a pure
flag — its presence alone means "true" (no value is consumed)
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - receives the flag value
>> defaultP - value used when the flag is absent
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        if (strcmp(argv[i], optName) == 0) {
            *p = true;
            return;
        }
    }

    /* the flag was not given: fall back to the default */
    *p = defaultP;
}
/*
read a floating-point option from an argument list
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - receives the parsed value
>> defaultP - value used when the option is absent
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        /* the option must be followed by its value */
        if (strcmp(argv[i], optName) == 0 && i + 1 < argc) {
            *p = (float)atof(argv[i + 1]);
            return;
        }
    }

    /* the option was not given: fall back to the default */
    *p = defaultP;
}
/*
print the argument list to stderr in "name=value" form
>> argc - number of arguments
>> argv - the argument list
*/
void ShowParams(int argc, char** argv)
{
    fprintf(stderr, "args:\n");
    for (int i = 0; i < argc; i++) {
        /* skip single-character arguments such as "-" */
        if (argv[i][1] == 0)
            continue;
        /* an option name starts with '-' and is not a negative number
           (second character in '1'..'9' is treated as a numeric value) */
        if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
            /* the next argument is the value unless it is another option */
            if (i + 1 < argc && argv[i + 1][0] != '-')
                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
            else
                fprintf(stderr, " %s=yes\n", argv[i]);
        }
    }
    fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
split string by delimiter, this will return the start indices of all
non-empty sub-strings
>> s - the original string
>> delimiter - as it is
<< indices - indices of all sub-strings
*/
UInt64List SplitToPos(const string& s, const string& delimiter)
{
    UInt64List indices;

    /* an empty delimiter means no splitting: the whole string is one
       piece. Returning here is essential — without it, find("") below
       matches at every position without ever advancing "start", which
       made the original loop spin forever */
    if (delimiter.length() == 0) {
        indices.Add(0);
        return indices;
    }

    size_t pos = 0;
    uint64_t start = 0;
    while ((pos = s.find(delimiter, start)) != string::npos) {
        /* a match right at "start" is an empty field: skip it */
        if (pos != start) {
            indices.Add(start);
        }
        start = pos + delimiter.length();
    }
    /* the trailing piece after the last delimiter, if any */
    if (start != s.length()) {
        indices.Add(start);
    }
    return indices;
}
/* split a string into a list of integers */
IntList SplitInt(const string& s, const string& delimiter)
{
    IntList result;
    UInt64List starts = SplitToPos(s, delimiter);
    for (int i = 0; i < starts.Size(); i++) {
        /* strtol stops at the first non-numeric character,
           i.e., at the next delimiter */
        result.Add(strtol(s.data() + starts[i], nullptr, 10));
    }
    return result;
}
/* split a string into a list of floats */
FloatList SplitFloat(const string& s, const string& delimiter)
{
    FloatList result;
    UInt64List starts = SplitToPos(s, delimiter);
    for (int i = 0; i < starts.Size(); i++) {
        /* strtof stops at the first non-numeric character,
           i.e., at the next delimiter */
        result.Add(strtof(s.data() + starts[i], nullptr));
    }
    return result;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#include <string>
#include <cstdio>
#include "../../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace transformer
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */
class T2TConfig {
public:
    /* path to the model */
    char modelFN[1024];

    /* path to the source vocab */
    char srcVocabFN[1024];

    /* path to the target vocab */
    char tgtVocabFN[1024];

    /* path to the input file (for inference) */
    char testFN[1024];

    /* path to the output file (for inference) */
    char outputFN[1024];

    /* path to the training file */
    char trainFN[1024];

    /* path to the validation file */
    char validFN[1024];

    /* device id */
    int devID;

    /* beam size */
    int beamSize;

    /* word batch size */
    int wBatchSize;

    /* sentence batch size */
    int sBatchSize;

    /* number of heads in attention */
    int nhead;

    /* number of encoder layers */
    int nEncLayer;

    /* number of decoder layers */
    int nDecLayer;

    /* the maximum relative position in RPR attentions */
    int maxRP;

    /* the dimension of embeddings */
    int embSize;

    /* the dimension of hidden layer */
    int modelSize;

    /* the maximum length in positional embedding */
    int maxPosLen;

    /* the dimension of fnn hidden layer */
    int fnnHiddenSize;

    /* the vocab size of source sequence */
    int srcVocabSize;

    /* the vocab size of target sequence */
    int tgtVocabSize;

    /* the padding id */
    int padID;

    /* start symbol */
    int startID;

    /* end symbol */
    int endID;

    /* indicates whether the model uses pre-norm */
    bool preNorm;

    /* indicates whether the model is running for machine translation */
    bool isMT;

    /* indicates whether the model is running with FP16 data type */
    bool useFP16;

    /* indicates whether we use the RPR attention */
    bool useRPR;

    /* indicates whether we train the model */
    bool isTraining;

    /* dropout rate for the model */
    float dropout;

    /* dropout rate for fnn layers */
    float fnnDropout;

    /* dropout rate for attention layers */
    float attDropout;

    /* the alpha parameter controls the length preference */
    float lenAlpha;

    /* scalar of the input sequence (for max number of search steps) */
    float maxLenAlpha;

    /* learning rate */
    float lrate;

    /* the parameter that controls the maximum learning rate in training */
    float lrbias;

    /* training epoch number */
    int nepoch;

    /* training step number */
    int nstep;

    /* indicates whether we use Adam */
    bool useAdam;

    /* hyper parameters of Adam */
    float adamBeta1;
    float adamBeta2;
    float adamDelta;

    /* step number of warm-up for training */
    int nwarmup;

    /* indicates whether the data file is shuffled for training */
    bool isShuffled;

    /* the factor of label smoothing */
    float labelSmoothingP;

    /* number of steps after which we make a checkpoint */
    int nStepCheckpoint;

    /* indicates whether we make a checkpoint after each training epoch */
    bool useEpochCheckpoint;

    /* number of batches on which we do model update */
    int updateStep;

    /* indicates whether we intend to debug the net */
    bool isDebugged;

    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

    /* buffer size */
    int bufSize;

    /* indicates whether we double the </s> symbol for the output of LM */
    bool isDoubledEnd;

    /* indicates whether we use batchsize = max * sc
       rather than batchsize = word-number, where max is the maximum
       length and sc is the sentence number */
    bool isSmallBatch;

    /* counterpart of "isSmallBatch" */
    bool isBigBatch;

    /* randomize batches */
    bool isRandomBatch;

    /* bucket size */
    int bucketSize;

public:
    /* load configurations from the command */
    T2TConfig(int argc, const char** argv);

    /* load configurations from a file */
    int LoadFromFile(const char* configFN, char** args);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node to keep batch information */
struct BatchNode
{
    /* beginning position */
    int beg;

    /* end position */
    int end;

    /* maximum word number on the encoder side */
    int maxEnc;

    /* maximum word number on the decoder side */
    int maxDec;

    /* a key for sorting */
    int key;
};
/* loader that reads sequences from file and groups them into batches */
class T2TBatchLoader
{
public:
    /* buffer for loading words */
    int* buf;

    /* another buffer */
    int* buf2;

    /* batch buf */
    BatchNode* bufBatch;

    /* buffer size */
    int bufSize;

    /* size of batch buffer */
    int bufBatchSize;

    /* length of each sequence */
    int* seqLen;

    /* another array */
    int* seqLen2;

    /* offset of the first word for each sequence */
    int* seqOffset;

    /* number of sequences in the buffer */
    int nseqBuf;

    /* offset for next sequence in the buffer */
    int nextSeq;

    /* offset for next batch */
    int nextBatch;

    /* indicates whether we double the </s> symbol for the output of LM */
    bool isDoubledEnd;

    /* indicates whether we use batchsize = max * sc
       rather than batchsize = word-number, where max is the maximum
       length and sc is the sentence number */
    bool isSmallBatch;

    /* counterpart of "isSmallBatch" */
    bool isBigBatch;

    /* randomize batches */
    bool isRandomBatch;

    /* bucket size */
    int bucketSize;

public:
    /* constructor */
    T2TBatchLoader();

    /* de-constructor */
    ~T2TBatchLoader();

    /* initialization */
    void Init(T2TConfig& config);

    /* load data to buffer */
    int LoadBuf(FILE* file, bool isSorted, int step);

    /* clear data buffer */
    void ClearBuf();

    /* set the random batch flag */
    void SetRandomBatch(bool flag = true);

    /* load a batch of sequences */
    int LoadBatch(FILE* file, bool isLM,
                  XTensor* batchEnc, XTensor* paddingEnc,
                  XTensor* batchDec, XTensor* paddingDec,
                  XTensor* gold, XTensor* label,
                  int* seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch,
                  bool isSorted, int& ws, int& wCount,
                  int devID, bool isTraining);

    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE* file,
                    XTensor* batchEnc, XTensor* paddingEnc,
                    XTensor* batchDec, XTensor* paddingDec,
                    XTensor* gold, XTensor* label,
                    int* seqs, int vs, int sBatch, int wBatch,
                    bool isSorted, int& wCount,
                    int devID, bool isTraining);

    /* load a batch of sequences (for machine translation) */
    int LoadBatchMT(FILE* file,
                    XTensor* batchEnc, XTensor* paddingEnc,
                    XTensor* batchDec, XTensor* paddingDec,
                    XTensor* gold, XTensor* label,
                    int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
                    bool isSorted, int& ws, int& wCount,
                    int devID, bool isTraining);

    /* shuffle the data file */
    void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__
#include "../T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* trainer of the T2T model */
class T2TTrainer
{
public:
    /* configurations */
    T2TConfig* cfg;

    /* dimension size of each inner layer */
    int d;

    /* step number of warm-up for training */
    int nwarmup;

    /* vocabulary size of the source side */
    int vSize;

    /* vocabulary size of the target side */
    int vSizeTgt;

    /* learning rate */
    float lrate;

    /* the parameter that controls the maximum learning rate in training */
    float lrbias;

    /* sentence batch size */
    int sBatchSize;

    /* word batch size */
    int wBatchSize;

    /* training epoch number */
    int nepoch;

    /* training step number */
    int nstep;

    /* indicates whether we use adam */
    bool useAdam;

    /* hyper parameters of adam */
    float adamBeta1;
    float adamBeta2;
    float adamDelta;
    float adamBeta1T;
    float adamBeta2T;

    /* list of the moment of the parameter matrices */
    TensorList moments;

    /* list of the 2nd order moment of the parameter matrices */
    TensorList moments2nd;

    /* indicates whether the data file is shuffled for training */
    bool isShuffled;

    /* the factor of label smoothing */
    DTYPE labelSmoothingP;

    /* number of steps after which we make a checkpoint */
    int nStepCheckpoint;

    /* indicates whether we make a checkpoint after each training epoch */
    bool useEpochCheckpoint;

    /* number of batches on which we do model update */
    int updateStep;

    /* indicates whether we intend to debug the net */
    bool isDebugged;

    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

    /* for batching */
    T2TBatchLoader batchLoader;

public:
    /* constructor */
    T2TTrainer();

    /* de-constructor */
    ~T2TTrainer();

    /* initialize the trainer */
    void Init(T2TConfig& config);

    /* train the model */
    void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);

    /* test the model */
    void Validate(const char* fn, const char* ofn, T2TModel* model);

    /* make a checkpoint */
    void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);

    /* update the model by delta rule */
    void Update(T2TModel* model, const float lr);

    /* prepare model for training */
    void PrepareModel(T2TModel* model);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "T2TDataSet.h"
#include "../module/T2TUtility.h"
using namespace transformer;
namespace nts {
/* sort the input by sequence length (in descending order), so the longest
   sentences come first (the original comment described SortOutput) */
void DataSet::SortInput() {
    sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
        return a->values.count > b->values.count;
    });
}
/* sort the output by sentence id (in ascending order) so the translations are
   restored to the input order (the original comment described SortInput) */
void DataSet::SortOutput() {
    sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
        return a->id < b->id;
    });
}
/*
load the whole data file into the input buffer:
tokenize each line, map the tokens to ids, append EOS, and sort by length.
Fix: the original allocated an Example for every line, leaking the object
when the line was empty; now the allocation happens only for kept sentences.
*/
void DataSet::LoadDataToBuffer()
{
    string line;
    inputBuffer.Clear();
    bufferUsed = 0;
    int id = 0;

    const string tokenDelimiter = " ";

    while (getline(*fp, line)) {
        IntList values;

        /* load words and transform them to ids */
        auto indices = SplitToPos(line, tokenDelimiter);

        /* reserve the first 120 (MAX_WORD_NUM) words if the input is too long */
        size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();

        for (size_t i = 0; i < maxLen; i++) {
            /* a token spans from its start position to the next token's start
               (or to the end of the line for the last token) */
            auto offset = (i != (indices.Size() - 1)) ?
                indices[i + 1] - indices[i] - tokenDelimiter.size()
                : line.size() - indices[i];
            string word = line.substr(indices[i], offset);
            /* out-of-vocabulary words are mapped to id 3
               (presumably <unk> - confirm with the vocab file) */
            if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
                values.Add(3);
            else
                values.Add(srcVocab.word2id.at(word));
        }

        /* make sure that the sequence ends with EOS */
        if (values.Size() != 0 && values[-1] != EOS)
            values.Add(EOS);

        /* keep non-empty sentences; remember the ids of empty lines so the
           output can be re-aligned with the input later */
        if (values.Size() != 0) {
            Example* example = new Example;
            example->id = id;
            example->values = values;
            inputBuffer.Add(example);
        }
        else {
            emptyLines.Add(id);
        }
        id++;
    }
    fp->close();

    SortInput();

    XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of (padded) words in a batch
>> devID - the device id, -1 for the CPU
<< list of sentence ids in the batch; the last element is the total
   (unpadded) token count
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    size_t minSentBatch, size_t batchSize, int devID)
{
    size_t realBatchSize = minSentBatch;

    /* get the maximum sentence length in a mini-batch; the buffer is sorted
       by length in descending order, so the first sentence is the longest */
    size_t maxLen = inputBuffer[bufferUsed]->values.Size();

    /* dynamic batching for sentences: grow the batch while the padded
       token count (sentences * maxLen) stays below the word budget */
    while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
        && (realBatchSize * maxLen < batchSize)) {
        realBatchSize++;
    }

    /* real batch size, clipped by the number of remaining sentences */
    if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
        realBatchSize = inputBuffer.Size() - bufferUsed;
    }

    CheckNTErrors(maxLen != 0, "invalid length");

    int* batchValues = new int[realBatchSize * maxLen];
    float* paddingValues = new float[realBatchSize * maxLen];

    /* pre-fill with id 1 (presumably <pad> - confirm with the vocab)
       and a zeroed padding mask */
    for (int i = 0; i < realBatchSize * maxLen; i++) {
        batchValues[i] = 1;
        paddingValues[i] = 0.0F;
    }

    size_t cur = 0;

    /* left padding: each sentence is right-aligned inside its row */
    UInt64List infos;
    size_t totalLength = 0;

    for (int i = 0; i < realBatchSize; ++i) {
        infos.Add(inputBuffer[bufferUsed + i]->id);
        totalLength += inputBuffer[bufferUsed + i]->values.Size();

        cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
        for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
            batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
            paddingValues[cur++] = 1.0F;
        }
    }
    infos.Add(totalLength);

    InitTensor2D(batchEnc, realBatchSize, maxLen, X_INT, devID);
    InitTensor2D(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);

    /* mark these sentences as consumed */
    bufferUsed += realBatchSize;

    batchEnc->SetData(batchValues, batchEnc->unitNum);
    paddingEnc->SetData(paddingValues, paddingEnc->unitNum);

    delete[] batchValues;
    delete[] paddingValues;

    return infos;
}
/*
the initializer of DataSet (not a constructor): opens the data file,
loads the vocabularies and fills the input buffer
>> dataFile - path of the data file
>> srcVocabFN - path of the source vocab file
>> tgtVocabFN - path of the target vocab file
*/
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
    fp = new ifstream(dataFile);
    CheckNTErrors(fp->is_open(), "can not open the file");
    bufferUsed = 0;

    CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
    CheckNTErrors(strcmp(tgtVocabFN, "") != 0, "missing target vocab file");

    srcVocab.Load(srcVocabFN);

    /* share source and target vocabs when both sides use the same file */
    if (strcmp(srcVocabFN, tgtVocabFN) == 0) {
        XPRINT(0, stderr, "[INFO] share source and target vocabs \n");
        tgtVocab.CopyFrom(srcVocab);
    }
    else {
        tgtVocab.Load(tgtVocabFN);
    }

    LoadDataToBuffer();
}
/* check if the buffer is exhausted, i.e. all loaded sentences
   have already been consumed by LoadBatch() */
bool DataSet::IsEmpty() {
    return bufferUsed >= inputBuffer.Size();
}
/* dump the translations to a file, one sentence per line
>> ofn - path of the output file */
void DataSet::DumpRes(const char* ofn)
{
    ofstream ofile(ofn, ios::out);

    for (int sent = 0; sent < outputBuffer.Size(); sent++) {
        auto result = outputBuffer[sent];
        for (int pos = 0; pos < result->res.Size(); pos++) {
            int token = result->res[pos];
            /* ids below 4 are reserved symbols (presumably pad/unk/sos/eos -
               confirm with the vocab); stop dumping at the first one */
            if (token < 4)
                break;
            ofile << tgtVocab.id2word[token] << " ";
        }
        ofile << "\n";
    }

    ofile.close();
}
/* de-constructor: releases everything the data set owns */
DataSet::~DataSet()
{
    /* release all buffered examples */
    for (int k = 0; k < inputBuffer.Size(); k++)
        delete inputBuffer[k];

    /* release all buffered results */
    for (int k = 0; k < outputBuffer.Size(); k++)
        delete outputBuffer[k];

    /* release the file stream */
    delete fp;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "T2TVocab.h"
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* the struct of tokenized input (one source sentence) */
struct Example {
    /* line number of the sentence in the input file */
    int id;
    /* token ids of the sentence */
    IntList values;
};
/* the struct of tokenized output (one translated sentence) */
struct Result {
    /* line number of the source sentence this result belongs to */
    int id;
    /* token ids of the translation */
    IntList res;
};
/* A `DataSet` is associated with a file which contains variable-length data. */
struct DataSet {
public:
    /* the data buffer (sorted by length after loading) */
    InputBufferType inputBuffer;

    /* list of empty line numbers, kept so the output can be
       re-aligned with the input */
    IntList emptyLines;

    /* the result buffer */
    OutputBufferType outputBuffer;

    /* the pointer to the file stream (owned; released in the de-constructor) */
    ifstream* fp;

    /* number of buffered sentences consumed so far */
    size_t bufferUsed;

    /* the source vocabulary */
    Vocab srcVocab;

    /* the target vocabulary */
    Vocab tgtVocab;

public:
    /* sort the input by length (in descending order) */
    void SortInput();

    /* reorder the output by ids (in ascending order) */
    void SortOutput();

    /* load data from a file to the buffer */
    void LoadDataToBuffer();

    /* generate a mini-batch */
    UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
        size_t sBatch, size_t wBatch, int devID);

    /* initialization function */
    void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);

    /* check if the buffer is empty */
    bool IsEmpty();

    /* dump the translations to a file */
    void DumpRes(const char* ofn);

    /* de-constructor */
    ~DataSet();
};
}
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence
>> alpha - the parameter that controls the length preference
<< return - length penalty of the sequence
*/
float T2TLengthPenalizer::GNMT(float length, float alpha)
{
    /* normalized base term (5 + n) / (5 + 1) */
    const float base = (length + 5.0F) / (1.0F + 5.0F);

    /* raise to alpha: larger alpha favors longer sequences */
    return (float)pow(base, alpha);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../module/T2TUtility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences because they have higher scores
   in a product of probability-like terms and thus have more chances
   to beat others in search. */
class T2TLengthPenalizer
{
public:
    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
       where n = length of the sequence */
    static float GNMT(float length, float alpha);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <iostream>
#include "T2TPredictor.h"
#include "../module/T2TNNUtil.h"
using namespace nts;
namespace transformer
{
/* constructor: an empty bundle with no state array allocated yet */
T2TStateBundle::T2TStateBundle()
{
    isStart = false;
    states = NULL;
}
/* de-constructor: releases the state array */
T2TStateBundle::~T2TStateBundle()
{
    /* delete[] on a null pointer is a well-defined no-op */
    delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
    /* default start symbol id 2 (presumably <s> - confirm with the vocab);
       can be overridden via SetStartSymbol() */
    startSymbol = 2;
}
/* de-constructor (the predictor owns no resources; m and s are borrowed) */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial (empty) state bundle for search
>> model - the t2t model (not referenced in this function)
>> top - the top-most layer of the network (not referenced in this function)
>> input - input of the network; its leading dimensions shape the state tensors
>> beamSize - beam size; replaces the last dimension of the input's shape
>> state - the state bundle to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
    int beamSize, T2TStateBundle* state)
{
    /* shape = input shape with the last dimension replaced by beamSize */
    int dims[MAX_TENSOR_DIM_NUM];
    for (int i = 0; i < input->order - 1; i++)
        dims[i] = input->dimSize[i];
    dims[input->order - 1] = beamSize;

    InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
    InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);

    /* no accumulated path probability and no finished hypotheses yet */
    state->probPath.SetZeroAll();
    state->nstep = 0.0F;
    state->endMark.SetZeroAll();

    state->stateNum = 0;
}
/*
set the start symbol used as the first decoder input
>> symbol - the symbol id (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
    startSymbol = symbol;
}
/*
attach the model and the current state bundle; both are borrowed
and used by subsequent Predict() calls
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
   1) hypotheses (states)
   2) probabilities of hypotheses
   3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
    m = model;
    s = state;
}
/*
predict the next state (one incremental decoding step)
>> next - the state bundle that receives the prediction probabilities
>> aliveState - indices of still-alive hypotheses, (B)
>> encoding - encoder output, (B, L, E)
>> inputEnc - input of the encoder, (B, L)
>> paddingEnc - padding of the encoder, (B, L)
>> batchSize - the raw batch size (in case some states are pruned)
>> isStart - whether it is the start state or not
>> reorderState - the new order of states
>> needReorder - whether we need to reorder the states
>> nstep - current time step of the target sequence
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
    XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
    XTensor& reorderState, bool needReorder, int nstep)
{
    int dims[MAX_TENSOR_DIM_NUM];

    /* word indices of positions up to next state */
    XTensor inputDec;

    /* the first token of every sequence is the start symbol */
    XTensor first;
    InitTensor2D(&first, batchSize, 1, X_INT, inputEnc.devID);
    first.SetDataFixed(startSymbol);

    /* add a new word into the input sequence of the decoder side */
    if (isStart) {
        inputDec = Identity(first);
    }
    else {
        /* only pass one step to the decoder (incremental decoding with cache) */
        inputDec = GetLastPrediction(s, inputEnc.devID);
    }

    /* keep alive states for the decoder; a shrunken aliveState means
       some hypotheses were pruned and the caches must be filtered too */
    if (aliveState.dimSize[0] < batchSize) {
        /* alive inputs */
        inputDec = AutoGather(inputDec, aliveState);

        /* alive cache */
        for (int i = 0; i < m->decoder->nlayer; i++) {
            m->decoder->selfAttCache[i].KeepAlive(aliveState);
            m->decoder->enDeAttCache[i].KeepAlive(aliveState);
        }
    }

    /* re-permute the caches when the beam order changed */
    if (needReorder) {
        for (int i = 0; i < m->decoder->nlayer; i++) {
            m->decoder->selfAttCache[i].Reorder(reorderState);
            m->decoder->enDeAttCache[i].Reorder(reorderState);
        }
    }

    /* prediction probabilities (written into the next state bundle) */
    XTensor& output = next->prob;
    XTensor decoding;

    /* decoder-side padding has the same shape as the decoder input */
    for (int i = 0; i < inputDec.order - 1; i++)
        dims[i] = inputDec.dimSize[i];
    dims[inputDec.order - 1] = inputDec.dimSize[inputDec.order - 1];

    XTensor paddingDec;
    InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc.devID);
    paddingDec.SetDataFixed(1);

    XTensor maskDec;
    XTensor maskEncDec;

    /* decoder mask */
    m->MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);

    /* make the decoding network */
    decoding = m->decoder->Make(inputDec, encoding, NULL, &maskEncDec, nstep, false);

    CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");

    /* generate the output probabilities */
    m->outputLayer->Make(decoding, output, false, true);
}
/*
generate paths up to the states of the current step: each row is the token
sequence leading to a state, left-padded with zeros for shorter paths
>> state - state bundle of the current step
<< a (stateNum, distance) integer tensor of token ids
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    /* the longest back-pointer chain determines the path length */
    int distance = -1;
    for (int i = 0; i < state->stateNum; i++) {
        T2TState* cur = state->states + i;
        int nsteps = 0;

        while (cur != NULL) {
            nsteps++;
            cur = cur->last;
        }

        if (nsteps > distance)
            distance = nsteps;
    }

    /* NOTE(review): if stateNum == 0, distance stays -1 and the tensor shape
       below is invalid - confirm callers never pass an empty bundle */
    XTensor path;
    InitTensor2D(&path, state->stateNum, distance, X_INT);
    path.SetZeroAll();

    /* walk each chain backwards and fill tokens from the right */
    for (int i = 0; i < state->stateNum; i++) {
        T2TState* cur = state->states + i;
        int nsteps = 0;

        while (cur != NULL) {
            nsteps++;
            path.Set2DInt(cur->prediction, i, distance - nsteps);
            cur = cur->last;
        }
    }

    return path;
}
/*
collect the predictions made at the previous step
>> state - state bundle of the current step
>> devID - device on which the result tensor lives
<< a (stateNum, 1) integer tensor holding the last prediction of each state
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
{
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    /* gather the last prediction of every hypothesis */
    IntList preds;
    for (int i = 0; i < state->stateNum; i++)
        preds.Add(state->states[i].prediction);

    XTensor lastPred;
    InitTensor2D(&lastPred, preds.Size(), 1, X_INT, devID);
    lastPred.SetData(preds.items, preds.Size());

    return lastPred;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "../T2TModel.h"
#include "T2TLengthPenalty.h"
using namespace std;
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
   and etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
    /* we assume that the prediction is an integer */
    int prediction;

    /* id of the problem. One can regard it as the sentence id when we
       translate a number of sentences in the batched manner. The hypothesis
       is empty if id = -1 */
    int pid;

    /* indicates whether the state is an end */
    bool isEnd;

    /* indicates whether the state is the start */
    bool isStart;

    /* indicates whether the state is completed */
    bool isCompleted;

    /* probability of the prediction (last state of the path) */
    float prob;

    /* probability of the whole path */
    float probPath;

    /* model score of the path. A model score = path probability + some other stuff */
    float modelScore;

    /* number of steps we go over so far */
    int nstep;

    /* pointer to the previous state (back-pointer along the path) */
    T2TState* last;
};
/* a bundle of states: tensor views over a whole beam of hypotheses */
class T2TStateBundle
{
public:
    /* predictions */
    XTensor prediction;

    /* id of the previous state that generates the current one */
    XTensor preID;

    /* mark that indicates whether each hypothesis is completed */
    XTensor endMark;

    /* probability of every prediction (last state of the path) */
    XTensor prob;

    /* probability of every path */
    XTensor probPath;

    /* model score of every path */
    XTensor modelScore;

    /* step number of each hypothesis */
    float nstep;

    /* list of states (owned; released in the de-constructor) */
    T2TState* states;

    /* number of states */
    int stateNum;

    /* indicates whether it is the first state */
    bool isStart;

public:
    /* constructor */
    T2TStateBundle();

    /* de-constructor */
    ~T2TStateBundle();

    /* create states */
    void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
   It is exactly the same procedure of MT inference -
   we get the state of previous words and then generate the next word.
   Here, a state can be regarded as the representation of words (word
   indices, hidden states, embeddings and etc.). */
class T2TPredictor
{
private:
    /* pointer to the transformer model (borrowed, set by Read()) */
    T2TModel* m;

    /* current state (borrowed, set by Read()) */
    T2TStateBundle* s;

    /* start symbol */
    int startSymbol;

    /* end symbol (NOTE(review): not initialized in the visible constructor -
       confirm it is set before use) */
    int endSymbol;

public:
    /* constructor */
    T2TPredictor();

    /* de-constructor */
    ~T2TPredictor();

    /* create an initial state */
    void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);

    /* set the start symbol */
    void SetStartSymbol(int symbol);

    /* read a state */
    void Read(T2TModel* model, T2TStateBundle* state);

    /* predict the next state */
    void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding,
        XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
        bool isStart, XTensor& reorderState, bool needReorder, int nstep);

    /* generate paths up to the states of the current step */
    XTensor GeneratePaths(T2TStateBundle* state);

    /* get the predictions of the previous step */
    XTensor GetLastPrediction(T2TStateBundle* state, int devID);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "../T2TModel.h"
#include "T2TPredictor.h"
using namespace std;
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
   distributions of the predictions and prunes the search space by beam pruning.
   This makes a graph where each path represents a translation hypothesis.
   The output can be the path with the highest model score. */
class BeamSearch
{
private:
    /* the alpha parameter controls the length preference */
    float alpha;

    /* predictor */
    T2TPredictor predictor;

    /* max length of the generated sequence */
    int maxLength;

    /* beam size */
    int beamSize;

    /* batch size */
    int batchSize;

    /* we keep the final hypotheses in a heap for each sentence in the batch */
    XHeap<MIN_HEAP, float>* fullHypos;

    /* array of the end symbols */
    int* endSymbols;

    /* number of the end symbols */
    int endSymbolNum;

    /* start symbol */
    int startSymbol;

    /* scalar of the input sequence (for the max number of search steps) */
    float scalarMaxLength;

    /* indicates whether the early stop strategy is used */
    bool isEarlyStop;

    /* pids for alive states */
    IntList aliveStatePids;

    /* alive sentences */
    IntList aliveSentList;

    /* whether we need to reorder the states */
    bool needReorder;

public:
    /* constructor */
    BeamSearch();

    /* de-constructor */
    ~BeamSearch();

    /* initialize the model */
    void Init(T2TConfig& config);

    /* search for the most promising states */
    void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);

    /* preparation */
    void Prepare(int myBatchSize, int myBeamSize);

    /* compute the model score for each hypothesis */
    void Score(T2TStateBundle* prev, T2TStateBundle* beam);

    /* generate token indices via beam pruning */
    void Generate(T2TStateBundle* prev, T2TStateBundle* beam);

    /* expand the search graph */
    void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState);

    /* collect hypotheses with an ending symbol */
    void Collect(T2TStateBundle* beam);

    /* fill the hypothesis heap with incomplete hypotheses */
    void FillHeap(T2TStateBundle* beam);

    /* save the output sequences and scores */
    void Dump(IntList* output, XTensor* score);

    /* check if the token is an end symbol */
    bool IsEnd(int token);

    /* check whether all hypotheses are completed */
    bool IsAllCompleted(T2TStateBundle* beam);

    /* update the beam by pruning finished states */
    void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
        XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);

    /* set end symbols for search */
    void SetEnd(const int* tokens, const int tokenNum);

    /* make a mask to prevent duplicated entries in beam expansion for the first position */
    XTensor MakeFirstMask(T2TStateBundle* beam);
};
/* greedy search: takes the single most probable prediction at each step
   (cheaper than beam search; no hypothesis bookkeeping is needed) */
class GreedySearch
{
private:
    /* predictor */
    T2TPredictor predictor;

    /* max length of the generated sequence */
    int maxLength;

    /* batch size */
    int batchSize;

    /* array of the end symbols */
    int* endSymbols;

    /* number of the end symbols */
    int endSymbolNum;

    /* start symbol */
    int startSymbol;

    /* scalar of the input sequence (for the max number of search steps) */
    float scalarMaxLength;

public:
    /* constructor */
    GreedySearch();

    /* de-constructor */
    ~GreedySearch();

    /* initialize the model */
    void Init(T2TConfig& config);

    /* search for the most promising states */
    void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output);

    /* preparation */
    void Prepare(int myBatchSize);

    /* check if the token is an end symbol */
    bool IsEnd(int token);

    /* set end symbols for search */
    void SetEnd(const int* tokens, const int tokenNum);
};
}
#endif
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论