Commit d291f56a by liyinqiao

Merge branch liyinqiao.

parent 66f7a298
差异被折叠。 点击展开。
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# NiuTrans.Tensor环境配置
## 注意事项
CUDA最新版本9.2尚且不支持VS2017最新版本,因此建议使用CUDA版本为9.0或9.1,建议使用VS版本为VS2015,或使用VS2017时安装v140工具集,解决方案平台设置为x64。
## CUDA配置
在已安装好VS、CUDA并配置好环境变量后,一些关键的CUDA配置选项如下所示,以下配置选项在 **项目 -> 属性** 中可以找到。
>$(CUDA_PATH)\include
加入到 **VC++目录 -> 包含** 中。
>$(CUDA_PATH)\lib\Win32
加入到 **VC++目录 -> 库** 中。
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
加入到 **链接器->输入->附加依赖项** 中。
配置完成后,右键 **工程->项目依赖性** ,选择CUDA9。
在.cu文件上右键属性,在项类型中选择"CUDA C/C++"(最好搜索.cu文件,然后全选设置)。
## 其他配置
**C/C++->常规->SDL检查**,设为否。
**C/C++->预处理器->预处理器定义** 中,添加
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CONSOLE;
**链接器->系统->子系统**,设置为控制台。
**常规->字符集**,使用Unicode字符集。
**调试->命令参数**中设置可执行文件所需要的参数。
This source diff could not be displayed because it is too large. You can view the blob instead.
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
#include "./tensor/function/FHeader.h"
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
using namespace nmt;
/*
entry point of the command-line tool.
The first argument selects what to run. The selected routine receives the
remaining arguments, with the selector itself in the argv[0] position.
>> argc - number of command-line arguments
>> argv - the command-line arguments
<< return - always 0
*/
int main( int argc, const char ** argv )
{
    /* an empty selector matches none of the modes and leads to the usage text */
    const char * mode = (argc > 1) ? argv[1] : "";

    if (strcmp(mode, "-test") == 0) {
        /* unit tests */
        Test();
    }
    else if (strcmp(mode, "-fnnlm") == 0) {
        /* feed-forward language model sample */
        FNNLMMain(argc - 1, argv + 1);
    }
    else if (strcmp(mode, "-t2t") == 0) {
        /* transformer sample */
        NMTMain(argc - 1, argv + 1);
    }
    else {
        /* unknown or missing selector: tell the user what is available */
        fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
    }

    return 0;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
#include <stdio.h>
#include "XNet.h"
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
/*
entry point of the sample program.
The first argument selects the sample to run. The selected routine receives
the remaining arguments, with the selector itself in the argv[0] position.
>> argc - number of command-line arguments
>> argv - the command-line arguments
<< return - always 0
*/
int main( int argc, const char ** argv )
{
    /* uncomment to enable leak checking with the MSVC debug CRT */
    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
    //_CrtSetBreakAlloc(2708);

    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        TransformerMain(argc - 1, argv + 1);
    else{
        /* the usage text lists exactly the modes handled above: this program
           has no "-test" branch, but it does have a "-t2t" branch */
        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-fnnlm\" for sample FNNLM!\n");
        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
    }

    //_CrtDumpMemoryLeaks();

    return 0;
}
/*
a small manual check of back-propagation through DivDim.
It fills a 2 x 3 tensor with 1..6 and a 2-element tensor with (2, 1),
computes c = DivDim(a, b, 0), dumps c, runs XNet::Backward on c and
dumps the network. All output goes to stderr.
*/
void BackwardTest()
{
    XNet net;

    XTensor a;
    XTensor b;
    XTensor c;

    /* not used below; kept so that the same tensors are constructed as before */
    XTensor mean;
    XTensor origin;

    const int rowNum = 2;
    const int colNum = 3;

    InitTensor2D(&a, rowNum, colNum);
    InitTensor1D(&b, rowNum);

    a.SetZeroAll();
    b.SetZeroAll();

    /* a = [[1, 2, 3], [4, 5, 6]], written in row-major order */
    float value = 1.0F;
    for (int row = 0; row < rowNum; row++) {
        for (int col = 0; col < colNum; col++) {
            a.Set2D(value, row, col);
            value += 1.0F;
        }
    }

    /* b = [2, 1] */
    b.Set1D(2.0F, 0);
    b.Set1D(1.0F, 1);

    /* forward computation */
    c = DivDim(a, b, 0);
    c.Dump(stderr, "c:");

    //XLink::ShowNetwork(stderr, &c);

    /* backward computation starting from c */
    net.Backward(c);
    net.Dump(stderr);
}
/*
a timing experiment for _Split, _Merge and _Sum (CUDA builds only).
Each operation is repeated "loops" times and timed in two ways: with
GetClock() on the host and with a pair of CUDA events. Both sets of
timings are printed to stderr.
The function body is empty when USE_CUDA is not defined.
*/
void TransposeTest()
{
#ifdef USE_CUDA
    /* not referenced below; kept because constructing it may have side effects
       (NOTE(review): presumably it reserves a memory pool on device 0 - confirm) */
    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);

    XTensor x;
    XTensor y;
    XTensor z;

    int loops = 2000;

    int B = 3 * 2 * 4;
    int K = 8 * 1;
    int N = 50;
    int H = 512 * 4;

    /* the trailing 0 is presumably the device id - TODO confirm */
    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
    InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
    InitTensor3D(&z, B, N, H, X_FLOAT, 0);

    /* one (start, stop) pair of events per measured operation */
    cudaEvent_t ctime0;
    cudaEvent_t ctime1;
    cudaEvent_t ctime2;
    cudaEvent_t ctime3;
    cudaEvent_t ctime4;
    cudaEvent_t ctime5;

    float elapsedSplit = 0.0;
    float elapsedMerge = 0.0;
    float elapsedSum = 0.0;

    cudaEventCreate(&ctime0);
    cudaEventCreate(&ctime1);
    cudaEventCreate(&ctime2);
    cudaEventCreate(&ctime3);
    cudaEventCreate(&ctime4);
    cudaEventCreate(&ctime5);

    /* split */
    cudaEventRecord(ctime0, 0);
    double time0 = GetClock();
    for(int i = 0; i < loops; i++)
        _Split(&x, &y, 2, K);
    double time1 = GetClock();
    cudaEventRecord(ctime1, 0);
    /* wait for the stop event before reading the elapsed time */
    cudaEventSynchronize(ctime1);
    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);

    /* merge */
    cudaEventRecord(ctime2, 0);
    double time2 = GetClock();
    for(int i = 0; i < loops; i++)
        _Merge(&y, &x, 3);
    double time3 = GetClock();
    cudaEventRecord(ctime3, 0);
    cudaEventSynchronize(ctime3);
    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);

    /* sum */
    cudaEventRecord(ctime4, 0);
    double time4 = GetClock();
    for(int i = 0; i < loops; i++)
        _Sum(&x, &z, &x);
    double time5 = GetClock();
    cudaEventRecord(ctime5, 0);
    cudaEventSynchronize(ctime5);
    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);

    /* host-side timings first, then the CUDA event timings */
    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);

    /* release the events; they were previously leaked on every call */
    cudaEventDestroy(ctime0);
    cudaEventDestroy(ctime1);
    cudaEventDestroy(ctime2);
    cudaEventDestroy(ctime3);
    cudaEventDestroy(ctime4);
    cudaEventDestroy(ctime5);
#endif
}
void SumDimTest()
{
XTensor x;
XTensor y;
XTensor z;
int a = 5;
int b = 7;
int c = 3;
InitTensor3D(&x, a, b, c, X_FLOAT, -1);
InitTensor1D(&y, c, X_FLOAT, -1);
InitTensor3D(&z, a, b, c, X_FLOAT, -1);
x.SetZeroAll();
y.SetZeroAll();
z.SetZeroAll();
DTYPE * data = new DTYPE[x.unitNum];
for(int i = 0; i < x.unitNum; i++)
data[i] = (DTYPE)i;
x.SetData(data, x.unitNum);
for(int i = 0; i < y.unitNum; i++)
data[i] = -(DTYPE)i;
y.SetData(data, y.unitNum);
_SumDim(&x, &y, &z, 2);
z.Dump(stderr, "z:");
delete[] data;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -31,37 +31,65 @@ namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(node->grad != NULL, "No gradient found!");
CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");
XTensor * input = income.tails[0];
XTensor * output = node;
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
XTensor * dedx = input->grad;
XTensor * dedy = output->grad;
XTensor* tmp;
/* store the result to a temporary node if the input has multiple children */
if (input->outgo.tailNum > 1) {
tmp = NewTensor(output);
tmp->SetZeroAll();
}
/* otherwise, the result is directly stored into the input node */
else {
tmp = dedx;
}
if (operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, dedy, tmp);
else if (operID == FUNC_IDENTITY)
_IdentityBackward(output, input, dedy, tmp);
else if (operID == FUNC_LOGSOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else if (operID == FUNC_RECTIFY)
_RectifyBackward(output, input, dedy, tmp);
else if (operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, dedy, tmp);
else if (operID == FUNC_SOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else {
ShowNTErrors("Unsupported backward computation! TODO!");
}
if (input->outgo.tailNum > 1) {
_SumMe(dedx, tmp);
DelTensor(tmp);
}
}
node->visitMark = NODE_FINISHED;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -33,7 +33,6 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
......@@ -48,33 +47,45 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
bool isRoot = XNoder::IsRoot(node);
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
if (!isEfficient || output->isGrad) {
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
return;
}
if (income.tailNum == 1) {
dedy->SetDataFixed(1);
return;
}
gold = income.tails[1];
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
XTensor* tmp;
if (!isRoot) {
tmp = NewTensor(output);
tmp->SetZeroAll();
}
else{
tmp = dedy;
}
if (operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
if (isRoot)
gold->DestroyData();
else
_SumMe(dedy, tmp);
}
else {
ShowNTErrors("Unsupported backward computation! TODO!");
}
if (!isRoot)
DelTensor(tmp);
}
node->visitMark = NODE_FINISHED;
......@@ -87,79 +98,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
return (income.typeID & LOSS_BASE) != 0;
}
/*
dispatch the backward computation (dE/dx) of an activation function y = f(x)
to the routine that matches the function id
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> padding - forwarded to the (log)softmax backward routines only
>> funcID - id of the function f
>> params - parameters of the function; read as a pointer to the leading
             dimension (int) for logsoftmax and softmax
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
                        XTensor * dedy, XTensor * dedx, XTensor * padding,
                        int funcID, void * params,
                        LOSS_FUNCTION_NAME lossName)
{
    CheckNTErrors(gold && y && x, "Empty input tensors!");
    CheckNTErrors(dedx, "Empty gradient tensors!");
    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");

    if (funcID == FUNC_HARDTANH) {
        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
        return;
    }

    if (funcID == FUNC_IDENTITY) {
        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
        return;
    }

    if (funcID == FUNC_LOGSOFTMAX) {
        /* params is only dereferenced for the softmax family */
        int leadDim = *(int*)params;
        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }

    if (funcID == FUNC_RECTIFY) {
        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
        return;
    }

    if (funcID == FUNC_SIGMOID) {
        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
        return;
    }

    if (funcID == FUNC_SOFTMAX) {
        int leadDim = *(int*)params;
        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
        return;
    }

    /* none of the supported functions matched */
    ShowNTErrors("wrong function found when call the backward process!");
}
/*
compute dE/dy for variable y and error(loss) function E.
Without a gold standard, dE/dy is filled with the constant 1.
With a gold standard, only the cross-entropy loss is handled here.
>> gold - gold standard to measure error (or loss), may be NULL
>> y - output of the function
>> dedy - dE/dy
>> padding - forwarded to the cross-entropy backward routine
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
                        XTensor * dedy, XTensor * padding,
                        LOSS_FUNCTION_NAME lossName)
{
    if (gold != NULL) {
        /* a gold standard is available: back-propagate through the loss */
        //_LossBackward(dedy, gold, y, lossName);
        if (lossName == CROSSENTROPY)
            _CrossEntropyBackward(dedy, y, gold, NULL, padding);
        return;
    }

    /* no gold standard: set every entry of dE/dy to one, by data type */
    if (dedy->dataType == X_FLOAT) {
        _SetDataFixedFloat(dedy, 1.0F);
    }
    else if (dedy->dataType == X_DOUBLE) {
        _SetDataFixedDouble(dedy, 1.0);
    }
    else if (dedy->dataType == X_INT) {
        _SetDataFixedInt(dedy, 1);
    }
    else {
        ShowNTErrors("TODO");
    }
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -43,11 +43,11 @@ public:
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
///* compute dE/dx for a given function y = f(x) */
//void Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -126,6 +126,18 @@ private:
static
void GradPower(XTensor * node, bool isEfficient);
/* gradient for reciprocal */
static
void GradReciprocal(XTensor* node, bool isEfficient);
/* gradient for sqrt */
static
void GradSqrt(XTensor* node, bool isEfficient);
/* gradient for square */
static
void GradSquare(XTensor* node, bool isEfficient);
/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
......@@ -146,10 +158,10 @@ private:
static
void GradSub(XTensor * node, bool isEfficient);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node, bool isEfficient);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node, bool isEfficient);
/* gradient for sum: c = a + b * \beta */
static
......@@ -173,6 +185,10 @@ private:
static
void GradReduceSum(XTensor * node, bool isEfficient);
/* gradient for reduceSumAll */
static
void GradReduceSumAll(XTensor * node, bool isEfficient);
/* gradient for reduceSumSquared */
static
void GradReduceSumSquared(XTensor * node, bool isEfficient);
......@@ -184,6 +200,10 @@ private:
/* gradient for operation */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
/* gradient for MLP */
static
void GradMLP(XTensor* node, bool isEfficient);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficent);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,55 +42,59 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId, bool isEfficent);
void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private:
/* gradient computation for convertdatatype: b = convertdatatype(a) */
static
void GradConvertDataType(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficent);
void GradCopyIndexed(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = gather(a, index) */
static
void GradGather(XTensor * node, bool isEfficent);
void GradGather(XTensor * node, bool isEfficient);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
void GradDropoutWithIndex(XTensor * node, bool isEfficient);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
void GradMerge(XTensor * node, bool isEfficient);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node, bool isEfficent);
void GradMergeList(XTensor * node, bool isEfficient);
/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficent);
void GradTranspose(XTensor * node, bool isEfficient);
/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficent);
void GradReshape(XTensor * node, bool isEfficient);
/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node, bool isEfficent);
void GradSplit(XTensor * node, bool isEfficient);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node, bool isEfficent);
void GradSplitList(XTensor * node, bool isEfficient);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs into one. This is good for system speed up. */
static
void GradSplitListPost(XTensor * node, bool isEfficent);
void GradSplitListPost(XTensor * node, bool isEfficient);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node, bool isEfficent);
void GradUnsqueeze(XTensor * node, bool isEfficient);
};
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -77,104 +77,20 @@ backward propagation to obtain gradient
>> root - root node (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
void XNet::Backward(XTensor &root)
{
TensorList roots(1);
roots.Add(&root);
TensorList golds(1);
golds.Add(NULL);
TensorList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
{
TensorList roots(1);
roots.Add(&root);
TensorList golds(1);
golds.Add(&gold);
TensorList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - specify a target value that is ignored and does not contribute to the gradient computation
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
TensorList roots(1);
roots.Add(&root);
TensorList golds(1);
golds.Add(&gold);
TensorList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
{
TensorList golds(roots.count);
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
}
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
{
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++)
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
Backward(roots);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> paddings - specify a target value that is ignored
>> loss - name of loss function
*/
void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
void XNet::Backward(TensorList &roots)
{
Traverse(roots);
......@@ -187,39 +103,6 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings,
node->visitMark = NODE_UNFINISHED;
}
//XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
/*for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;*/
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is no use in the
folloing process of back-propagation */
/*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}*/
/* we compuate dE/dy (y is the output) if no predefined activation function is used */
/*else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}*/
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);
......@@ -238,8 +121,13 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings,
ClearGrad(parent);
}
if(XNoder::IsLeaf(node))
if (XNoder::IsLeaf(node)) {
ClearGrad(node);
if (node->outgo.tailNum == 0) {
delete node;
}
}
}
}
}
......@@ -267,7 +155,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node, isEfficent);
else if(XLossGrad::IsLossOP(node))
XLossGrad::MakeGrad(node, isEfficent);
XLossGrad::MakeGrad(node, isEfficent);
else{
ShowNTErrors("Wrong node type!");
}
......@@ -433,7 +321,6 @@ void XNet::ClearGrad(XTensor * node)
}
if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
}
......@@ -451,7 +338,7 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
Traverse(roots);
XLink::ShowNode(file, node);
//XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
......@@ -460,7 +347,6 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
}
}
/*
search for a node in a top-down manner by its name
>> top - the top most node
......@@ -468,7 +354,7 @@ search for a node in a top-down manner by its name
*/
//XTensor * XNet::SearchNode(XTensor * top, const char * name)
//{
//return XLink::SearchNode(top, name);
//return XLink::SearchNode(top, name);
//}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -61,25 +61,11 @@ struct XNet
void Clear();
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XTensor &root);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
if(node == NULL)
return;
if(!XTensor::IsSameShaped(node, node->grad)){
if(!_IsSameShaped(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
node->grad->SetZeroAll();
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -20,7 +20,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "../tensor/XTensor.h"
#include "../tensor/core/CHeader.h"
#ifndef __XNODER_H__
#define __XNODER_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
{
/* constructor: every owned pointer starts out as NULL;
   the actual sub-modules are allocated in InitModel() */
AttDecoder::AttDecoder()
{
    /* self-attention */
    selfAtt = NULL;
    selfAttLayerNorms = NULL;
    selfAttCache = NULL;

    /* encoder-decoder attention */
    enDeAtt = NULL;
    enDeAttLayerNorms = NULL;
    enDeAttCache = NULL;

    /* feed-forward networks */
    fnns = NULL;
    fnnLayerNorms = NULL;

    /* layer normalization of the decoder itself */
    decoderLayerNorm = NULL;
}
/* destructor: release everything allocated in InitModel() */
AttDecoder::~AttDecoder()
{
    delete[] selfAttCache;
    delete[] enDeAttCache;
    delete[] selfAtt;
    delete[] fnns;
    delete[] selfAttLayerNorms;
    delete[] fnnLayerNorms;
    delete[] enDeAtt;
    delete[] enDeAttLayerNorms;

    /* deleted unconditionally: the constructor sets the pointer to NULL and
       deleting NULL is a no-op. This avoids reading preNorm, which the
       constructor does not initialize, when InitModel() was never called. */
    delete decoderLayerNorm;
}
/*
initialize the model: copy the decoder settings from the configuration,
allocate the per-layer sub-modules and initialize each of them
>> config - configurations of the model
NOTE(review): the arrays are allocated without releasing earlier ones, so
calling this method twice on the same object would leak - confirm that it
is only called once per decoder.
*/
void AttDecoder::InitModel(Config& config)
{
    devID = config.devID;
    nlayer = config.nDecLayer;
    hSize = config.modelSize;
    eSize = config.embSize;
    vSize = config.tgtVocabSize;
    dropoutP = config.dropout;
    preNorm = config.preNorm;

    /* this is the decoder, so the message refers to decoding layers */
    CheckNTErrors(nlayer >= 1, "We have one decoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");

    /* embedding model */
    embedder.InitModel(config, false);

    /* one instance of every sub-module per layer */
    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    selfAttLayerNorms = new LN[nlayer];
    enDeAtt = new Attention[nlayer];
    enDeAttLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];
    selfAttCache = new Cache[nlayer];
    enDeAttCache = new Cache[nlayer];

    /* the decoder-level layer normalization exists only with pre-norm */
    if (preNorm)
        decoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
        selfAtt[i].InitModel(config);
        fnns[i].InitModel(config);
        selfAttLayerNorms[i].InitModel(config);
        fnnLayerNorms[i].InitModel(config);
        enDeAtt[i].InitModel(config);
        enDeAttLayerNorms[i].InitModel(config);
        selfAttCache[i].enable = true;
        enDeAttCache[i].enable = true;
    }

    if (preNorm)
        decoderLayerNorm->InitModel(config);
}
/*
make the decoding network
Each layer runs self-attention, encoder-decoder attention and the
feed-forward network in that order. Every sub-layer is wrapped by two
LayerNorm() calls (one before, one after the residual sum) that receive
the preNorm flag.
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                         XTensor* maskEncDec, int nstep, bool isTraining)
{
    /* dropout is applied only in training and only for a positive rate */
    const bool useDropout = isTraining && dropoutP > 0;

    XTensor hidden;

    hidden = embedder.Make(inputDec, true, isTraining, nstep);

    if (useDropout)
        hidden = Dropout(hidden, dropoutP);

    for (int layer = 0; layer < nlayer; layer++) {

        XTensor selfOut;
        XTensor crossOut;
        XTensor ffnOut;
        XTensor residual;
        XTensor selfIn;
        XTensor selfDone;
        XTensor crossIn;
        XTensor crossDone;
        XTensor ffnIn;

        /* ---- self attention ---- */

        /* normalization in front of the sub-layer (pre-norm position) */
        selfIn = LayerNorm(hidden, selfAttLayerNorms[layer], preNorm, true, false);

        selfOut = selfAtt[layer].Make(selfIn, selfIn, selfIn,
                                      mask, isTraining, &selfAttCache[layer], SELF_ATT);

        if (useDropout)
            selfOut = Dropout(selfOut, dropoutP);

        /* residual connection */
        residual = Sum(selfOut, hidden);

        /* normalization behind the sub-layer (post-norm position) */
        selfDone = LayerNorm(residual, selfAttLayerNorms[layer], preNorm, false, true);

        /* ---- encoder-decoder attention ---- */

        crossIn = LayerNorm(selfDone, enDeAttLayerNorms[layer], preNorm, true, false);

        crossOut = enDeAtt[layer].Make(outputEnc, crossIn, outputEnc, maskEncDec,
                                       isTraining, &enDeAttCache[layer], EN_DE_ATT);

        if (useDropout)
            crossOut = Dropout(crossOut, dropoutP);

        /* residual connection */
        residual = Sum(crossOut, selfDone);

        crossDone = LayerNorm(residual, enDeAttLayerNorms[layer], preNorm, false, true);

        /* ---- feed-forward network ---- */

        ffnIn = LayerNorm(crossDone, fnnLayerNorms[layer], preNorm, true, false);

        ffnOut = fnns[layer].Make(ffnIn, isTraining);

        if (useDropout)
            ffnOut = Dropout(ffnOut, dropoutP);

        /* residual connection */
        residual = Sum(ffnOut, crossDone);

        /* the result becomes the input of the next layer */
        hidden = LayerNorm(residual, fnnLayerNorms[layer], preNorm, false, true);
    }

    /* pre-norm models normalize the output of the whole stack once more */
    if (preNorm)
        return decoderLayerNorm->Make(hidden);

    return hidden;
}
/*
make the decoding network (fast path, pre-norm only)
Every sub-layer normalizes its input first and adds the residual afterwards,
and the final decoder layer normalization is always applied. InitModel()
allocates that final normalization only when preNorm is set, so a model
without it is rejected up front instead of dereferencing a NULL pointer.
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                             XTensor* maskEncDec, int nstep, bool isTraining)
{
    CheckNTErrors(decoderLayerNorm != NULL,
                  "MakeFast needs the decoder layer normalization (pre-norm model)!");

    XTensor x;

    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout */
    if (isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    for (int i = 0; i < nlayer; i++) {

        /* input of the current sub-layer, kept for the residual connection */
        XTensor res;

        res = x;

        /* layer normalization with pre-norm for self-attn */
        x = selfAttLayerNorms[i].Make(x);

        /* self attention */
        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for encoder-decoder attention */
        x = enDeAttLayerNorms[i].Make(x);

        /* encoder-decoder attention */
        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
                            isTraining, &enDeAttCache[i], EN_DE_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for fnn */
        x = fnnLayerNorms[i].Make(x);

        /* fnn */
        x = fnns[i].Make(x, isTraining);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);
    }

    /* final layer normalization of the decoder stack */
    x = decoderLayerNorm->Make(x);

    return x;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,18 +16,17 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#ifndef __DECODER_H__
#define __DECODER_H__
#include "T2TEncoder.h"
#include "Encoder.h"
#include "Utility.h"
namespace transformer
namespace nmt
{
#define DECODING_NAME "decoding"
#define DECODING_INPUT_NAME "decoding_input"
class AttDecoder
{
......@@ -37,9 +35,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -55,50 +50,56 @@ public:
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
Attention* selfAtt;
/* layer normalization for attention */
T2TLN * attLayerNorms;
LN* selfAttLayerNorms;
/* input tensor of the encoder */
XTensor * input;
/* layer normalization for fnn */
LN* fnnLayerNorms;
/* output tensor of the encoder */
XTensor * output;
/* layer normalization for decoder */
LN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
Attention* enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
LN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
/* layer cache list */
Cache* enDeAttCache;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttDecoder();
/* deconstructor */
/* de-constructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
void InitModel(Config& config);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
......
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
{
/*
constructor
Gives every scalar setting a defined value and marks all layer arrays as
unallocated; InitModel() fills in the real values later.
*/
AttEncoder::AttEncoder()
{
    /* scalar settings (overwritten by InitModel) */
    devID = -1;
    nlayer = 0;
    eSize = 0;
    hSize = 0;
    vSize = 0;
    dropoutP = 0;
    preNorm = false;

    /* per-layer modules, allocated by InitModel */
    selfAtt = NULL;
    fnns = NULL;
    attLayerNorms = NULL;
    fnnLayerNorms = NULL;

    /* final layer normalization, only allocated for pre-norm models */
    encoderLayerNorm = NULL;
}
/*
de-constructor
Releases the layer arrays and the final layer normalization. The
constructor sets all of these pointers to NULL, so destroying a model
that was never initialized is safe.
*/
AttEncoder::~AttEncoder()
{
    delete[] selfAtt;
    delete[] fnns;
    delete[] attLayerNorms;
    delete[] fnnLayerNorms;

    /* deleting a NULL pointer is a no-op, so the final layer normalization
       is released unconditionally instead of depending on the preNorm flag */
    delete encoderLayerNorm;
}
/*
initialize the model
Reads the encoder settings from the configuration, then allocates and
initializes the embedder and one set of sub-modules per layer.
>> config - configurations for the model
*/
void AttEncoder::InitModel(Config& config)
{
    /* settings taken from the configuration */
    devID = config.devID;
    nlayer = config.nEncLayer;
    eSize = config.embSize;
    hSize = config.modelSize;
    vSize = config.srcVocabSize;
    preNorm = config.preNorm;
    dropoutP = config.dropout;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(config);

    /* storage for the stacked layers */
    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    attLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];

    /* the final layer normalization exists only for pre-norm models */
    if (preNorm) {
        encoderLayerNorm = new LN;
    }

    /* set up every layer in turn */
    for (int layer = 0; layer < nlayer; ++layer) {
        selfAtt[layer].InitModel(config);
        fnns[layer].InitModel(config);
        attLayerNorms[layer].InitModel(config);
        fnnLayerNorms[layer].InitModel(config);
    }

    if (preNorm) {
        encoderLayerNorm->InitModel(config);
    }
}
/*
make the encoding network
Each layer runs self-attention followed by the feed-forward network.
Both sub-layers are wrapped by two LayerNorm() calls (one before, one
after the residual sum) that receive the preNorm flag.
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    /* dropout is applied only in training and only for a positive rate */
    const bool useDropout = isTraining && dropoutP > 0;

    XTensor hidden;

    hidden = embedder.Make(input, false, isTraining);

    if (useDropout)
        hidden = Dropout(hidden, dropoutP);

    for (int layer = 0; layer < nlayer; layer++) {

        XTensor attOut;
        XTensor ffnOut;
        XTensor residual;
        XTensor attIn;
        XTensor attDone;
        XTensor ffnIn;

        /* ---- self attention ---- */

        /* normalization in front of the sub-layer (pre-norm position) */
        attIn = LayerNorm(hidden, attLayerNorms[layer], preNorm, true, false);

        attOut = selfAtt[layer].Make(attIn, attIn, attIn, mask, isTraining, NULL, SELF_ATT);

        if (useDropout)
            attOut = Dropout(attOut, dropoutP);

        /* residual connection */
        residual = Sum(attOut, hidden);

        /* normalization behind the sub-layer (post-norm position) */
        attDone = LayerNorm(residual, attLayerNorms[layer], preNorm, false, true);

        /* ---- feed-forward network ---- */

        ffnIn = LayerNorm(attDone, fnnLayerNorms[layer], preNorm, true, false);

        ffnOut = fnns[layer].Make(ffnIn, isTraining);

        if (useDropout)
            ffnOut = Dropout(ffnOut, dropoutP);

        /* residual connection */
        residual = Sum(ffnOut, attDone);

        /* the result becomes the input of the next layer */
        hidden = LayerNorm(residual, fnnLayerNorms[layer], preNorm, false, true);
    }

    /* pre-norm models normalize the output of the whole stack once more */
    if (preNorm)
        return encoderLayerNorm->Make(hidden);

    return hidden;
}
/*
make the encoding network (fast path, pre-norm only)
Every sub-layer normalizes its input first and adds the residual afterwards,
and the final encoder layer normalization is always applied. InitModel()
allocates that final normalization only when preNorm is set, so a model
without it is rejected up front instead of dereferencing a NULL pointer.
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    CheckNTErrors(encoderLayerNorm != NULL,
                  "MakeFast needs the encoder layer normalization (pre-norm model)!");

    XTensor x;

    x = embedder.Make(input, false, isTraining);

    /* dropout */
    if (isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    for (int i = 0; i < nlayer; i++) {

        /* input of the current sub-layer, kept for the residual connection */
        XTensor res;

        res = x;

        /* layer normalization with pre-norm for self-attn */
        x = attLayerNorms[i].Make(x);

        /* self attention */
        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for fnn */
        x = fnnLayerNorms[i].Make(x);

        /* fnn */
        x = fnns[i].Make(x, isTraining);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);
    }

    /* final layer normalization of the encoder stack */
    x = encoderLayerNorm->Make(x);

    return x;
}
/*
make the encoding network (wrapper)
Forwards to the four-argument Make() with an empty tensor in place of
the encoder-decoder mask.
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
{
    /* placeholder for the mask argument the encoder does not use */
    XTensor unusedMaskEncDec;

    return Make(input, mask, unusedMaskEncDec, isTraining);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,57 +16,42 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TENCODER_H__
#define __T2TENCODER_H__
#ifndef __ENCODER_H__
#define __ENCODER_H__
#include "T2TFNN.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
#define ENCODING_NAME "encoding"
#define ENCODING_INPUT_NAME "encoding_input"
/*
base class of the encoder
*/
class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
};
/*
the encoder based on RNN
/*
base class of the encoder
*/
class RNNEncoder : T2TEncoder
class Encoder
{
public:
XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
};
/*
the encoder based on self-attention
/*
the encoder based on self-attention
*/
class AttEncoder : T2TEncoder
class AttEncoder : Encoder
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -88,26 +72,26 @@ public:
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention * attentions;
Attention* selfAtt;
/* layer normalizations for attention */
LN* attLayerNorms;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* layer normalization for encoder */
LN* encoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
/* the location of layer normalization */
bool preNorm;
/* output tensor of the encoder */
XTensor * output;
public:
/* constructor */
AttEncoder();
......@@ -116,18 +100,18 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
void InitModel(Config& config);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,40 +16,38 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TMODEL_H__
#define __T2TMODEL_H__
#ifndef __MODEL_H__
#define __MODEL_H__
#include "T2TFNN.h"
#include "T2TAttention.h"
#include "T2TEncoder.h"
#include "T2TDecoder.h"
#include "T2TOutput.h"
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "Utility.h"
#include "module/Attention.h"
namespace transformer
namespace nmt
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
/* a nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the encoder */
AttEncoder * encoder;
AttEncoder* encoder;
/* the decoder */
AttDecoder * decoder;
AttDecoder* decoder;
/* output layer */
T2TOutput * outputLayer;
Output* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -58,53 +55,65 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* number of heads in the attention model */
int nhead;
/* indicates whether share encoders embeddings with decoders */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings with output weights */
int shareDecInputOutputWeight;
public:
/* constructor */
T2TModel();
Model();
/* de-constructor */
~T2TModel();
~Model();
/* initialize the model */
void InitModel(int argc, char ** argv);
void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
/* make the encoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
/* make the network for langauge modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec);
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrics */
void GetParams(TensorList &list);
/* get parameter matrices */
void GetParams(TensorList& list);
/* dump the parameters */
void Dump(const char * fn);
/* dump the model to a file */
void Dump(const char* fn);
/* read the parameters */
void Read(const char * fn);
void Read(FILE* file);
};
}
......
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/
#include <ctime>
#include "NMT.h"
#include "train/Trainer.h"
#include "translate/Translator.h"
namespace nmt
{
int NMTMain(int argc, const char** argv)
{
if (argc == 0)
return 1;
/* load configurations */
Config config(argc, argv);
srand(1);
/* training */
if (strcmp(config.trainFN, "") != 0) {
Model model;
model.InitModel(config);
Trainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
/* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD;
Model model;
model.InitModel(config);
Translator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
}
return 0;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,29 +15,17 @@
*/
/*
*
* An impelementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* work on MT
* An implementation of the NMT system.
*/
#ifndef __TRANSFORMER_H__
#define __TRANSFORMER_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
#ifndef __NMT_H__
#define __NMT_H__
namespace transformer
namespace nmt
{
/* entrance of the program */
int TransformerMain(int argc, const char ** argv);
int NMTMain(int argc, const char** argv);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TAttention.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/*
constructor
Gives every scalar member a defined value. The sizes are set to -1 until
InitModel() loads the real ones; the parameter tensors are left to their
own default construction.
*/
T2TAttention::T2TAttention()
{
    /* device and memory pool (same defaults as the InitModel arguments) */
    devID = -1;
    mem = NULL;

    /* sizes are unknown until InitModel is called */
    nhead = -1;
    dk = -1;
    dv = -1;
    d = -1;

    /* masking, training flag and dropout are off by default */
    isMasked = false;
    ignored = 0;
    isTraining = false;
    dropoutP = 0;
}
/* de-constructor (the body is empty) */
T2TAttention::~T2TAttention() {}
/*
initialize the model
Loads the hyper-parameters from the argument list, allocates the
transformation matrices and fills them with uniform random values.
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the attention is with a mask
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, char ** argv,
                             bool myIsMasked, int myIgnored,
                             int myDevID, XMem * myMem)
{
    devID = myDevID;
    mem = myMem;
    isMasked = myIsMasked;
    ignored = myIgnored;

    /* value of the "attminmax" option (loaded here, not used below) */
    float attRange = 0;

    /* dk, dv and d are all read from the same "d" option */
    LoadParamInt(argc, argv, "nhead", &nhead, 8);
    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "attminmax", &attRange, 0.1F);
    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);

    /* allocate the transformation matrices */
    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);

    /* bound of each uniform range: sqrt(6 * scale / (rows + columns)) */
    float scale = 1.0F;
    float boundKey = (float)sqrt(6.0F * scale/(d + dk));
    float boundValue = (float)sqrt(6.0F * scale/(d + dv));
    float boundOutput = (float)sqrt(6.0F * scale / (d + d));
    float boundFused = (float)sqrt(6.0F * scale / (d + 3*d));

    /* random initialization (keys and queries share one bound) */
    wk.SetDataRand(-boundKey, boundKey);
    wq.SetDataRand(-boundKey, boundKey);
    wv.SetDataRand(-boundValue, boundValue);
    wa.SetDataRand(-boundOutput, boundOutput);
    wbig.SetDataRand(-boundFused, boundFused);
}
/*
make the network
Multiplies keys, queries and values by wk, wq and wv respectively and
hands the results to MakeAttention().
>> k - keys. It might be of size B * L * H
       where B = batch size, L = sequence length,
       and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
    XTensor keyProj;
    XTensor queryProj;
    XTensor valueProj;

    /* linear transformation before self-attention */
    keyProj = MMul(k, wk);
    queryProj = MMul(q, wq);
    valueProj = MMul(v, wv);

    return MakeAttention(keyProj, queryProj, valueProj, mask, isTraining);
}
/*
make the network given a big tensor that keeps keys, queries and values
The input is multiplied by wbig once; the product is split into three
equal parts along dimension 2, which are taken as queries, keys and
values in that order.
>> kqv - the big tensor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
{
    XTensor keyPart;
    XTensor queryPart;
    XTensor valuePart;
    XTensor fused;
    TensorList parts;

    fused = MMul(kqv, wbig);

    /* each part keeps the first two dimensions and a third of the last one */
    int dim0 = fused.GetDim(0);
    int dim1 = fused.GetDim(1);
    int dimPart = fused.GetDim(2) / 3;

    InitTensor3D(&keyPart, dim0, dim1, dimPart, X_FLOAT, devID, mem);
    InitTensor3D(&queryPart, dim0, dim1, dimPart, X_FLOAT, devID, mem);
    InitTensor3D(&valuePart, dim0, dim1, dimPart, X_FLOAT, devID, mem);

    /* the order in the list is: queries, keys, values */
    parts.Add(&queryPart);
    parts.Add(&keyPart);
    parts.Add(&valuePart);

    Split(fused, parts, 2, 3);

    return MakeAttention(keyPart, queryPart, valuePart, mask, isTraining);
}
/*
make the attention network given keys, queries and values (after linear transformation)
The inputs are split into nhead heads along their last dimension, the
scaled dot-product attention is computed, and the merged result is
multiplied by wa.
>> k - keys. It might be of size B * L * H
       where B = batch size, L = sequence length,
       and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is (added to the dot products only if isMasked is set)
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
    /* factor applied to the dot products: 1 / sqrt(dk / nhead) */
    const float scaling = 1.0F/(float)sqrt((float)dk/nhead);

    XTensor keyHeads;
    XTensor queryHeads;
    XTensor valueHeads;

    /* multi head */
    keyHeads = Split(k, k.order - 1, nhead);
    queryHeads = Split(q, q.order - 1, nhead);
    valueHeads = Split(v, v.order - 1, nhead);

    XTensor headOut;
    XTensor scores;
    XTensor weights;

    /* weights = softmax(Q * K^T / sqrt(dk)), output = weights * V */
    scores = BMMul(queryHeads, X_NOTRANS, keyHeads, X_TRANS);

    /* the mask is added before the scaling */
    if (isMasked) {
        scores = scores + mask;
    }

    scores = Linear(scores, scaling);

    weights = Softmax(scores, -1);

    if (isTraining && dropoutP > 0) {
        weights = Dropout(weights, dropoutP);
    }

    headOut = BMMul(weights, valueHeads);

    /* concatenate the heads */
    return MMul(Merge(headOut, headOut.order - 1), wa);
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/*
multi-head attention
y(Q, K, V) = cat(head_1, head_2, ..., head_n)
where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
d_k = dimension size of K

The class keeps the transformation matrices of the attention model and
the settings loaded by InitModel(). Make() and MakeBig() apply the
input transformations and both end in MakeAttention().
*/
class T2TAttention
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* head number */
int nhead;
/* transformation matrix for K (d * dk) */
XTensor wk;
/* transformation matrix for Q (d * dk) */
XTensor wq;
/* transformation matrix for V (d * dv) */
XTensor wv;
/* transformation after dot-product attention (d * d) */
XTensor wa;
/* transformation matrix (d * 3d) used by MakeBig, whose product is
split into queries, keys and values */
XTensor wbig;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* indicates whether the model is used for training
NOTE(review): Make, MakeBig and MakeAttention take their own isTraining
argument; confirm whether this member is still needed. */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TAttention();
/* de-constructor */
~T2TAttention();
/* initialize the model (loads the settings from the argument list and
creates the transformation matrices) */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the network: transform k, q and v, then run the attention */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
/* make the network given a big tensor that keeps keys, queries and values */
XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* sequence-length limit constant (1024 * 16 = 16384). The value is parenthesized so that
   the macro behaves as a single number inside larger expressions such as x / MAX_SEQUENCE_LENGTH. */
#define MAX_SEQUENCE_LENGTH (1024 * 16)
/* node to keep batch information: a [beg, end] position pair, the maximum
   word numbers on the two sides, and a key used when nodes are sorted */
struct BatchNode
{
/* beginning position */
int beg;
/* end position (NOTE(review): whether it is inclusive is not visible in this header) */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
/* data loader of the transformer sample: it keeps word buffers and per-sequence
   bookkeeping, and declares routines that load the buffer and build batches for
   language modeling (LM) and machine translation (MT).
   Only declarations are visible here; see the implementation file for details. */
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization (argc/argv are the command-line style arguments) */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag (the flag defaults to true) */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences.
   The tensor arguments come in the order: encoder batch, encoder padding,
   decoder batch, decoder padding, gold standard, label.
   NOTE(review): isLM presumably selects between LoadBatchLM and LoadBatchMT - confirm
   in the implementation. */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file (takes a source file name and a target file name) */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
*/
#include <math.h>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: no layer has been created yet, so every layer array starts as NULL
   (the arrays are allocated in InitModel) */
AttDecoder::AttDecoder()
{
    /* attention layers: self attention and encoder-decoder attention */
    attentions = attentionsEnde = NULL;

    /* feed-forward layers */
    fnns = NULL;

    /* layer normalizations that follow each of the three sub-layers */
    attLayerNorms = fnnLayerNorms = attEndeLayerNorms = NULL;
}
/* de-constructor: releases the per-layer arrays allocated with new[] in InitModel.
   The pointers are NULL when InitModel was never called, in which case delete[] does nothing. */
AttDecoder::~AttDecoder()
{
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
                (used for the self-attention layers; the encoder-decoder
                attention layers are always initialized with "true")
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    devID = myDevID;
    mem = myMem;
    ignored = myIgnored;

    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

    CheckNTErrors(nlayer >= 1, "We have one decoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");

    /* embedding model (the last argument "false" selects the non-encoder setting) */
    embedder.InitModel(argc, argv, devID, mem, false);

    /* release the layers of a previous initialization (if any) so that calling
       InitModel again does not leak them; the pointers are NULL after construction */
    delete[] attentions;
    delete[] fnns;
    delete[] attLayerNorms;
    delete[] fnnLayerNorms;
    delete[] attentionsEnde;
    delete[] attEndeLayerNorms;

    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
    attLayerNorms = new T2TLN[nlayer];
    fnnLayerNorms = new T2TLN[nlayer];
    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
        fnns[i].InitModel(argc, argv, myDevID, myMem);
        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
}
/*
make the decoding network: each layer runs self attention, encoder-decoder
attention and a feed-forward network, each followed by (optional) dropout,
a residual connection and layer normalization
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
    /* dropout is used only in training and only for a positive dropout probability */
    const bool useDropout = isTraining && dropoutP > 0;

    XTensor x;

    x = embedder.Make(inputDec);

    if(useDropout)
        x = Dropout(x, dropoutP);

    for(int layer = 0; layer < nlayer; layer++){
        XTensor selfAtt;
        XTensor crossAtt;
        XTensor ln;          /* not used below; kept so that the same tensors are created per layer */
        XTensor ffn;
        XTensor residual;

        /* 1) self attention -> dropout -> residual connection -> layer normalization */
        selfAtt = attentions[layer].MakeBig(x, mask, isTraining);
        if(useDropout)
            selfAtt = Dropout(selfAtt, dropoutP);
        residual = Sum(selfAtt, x);
        x = attLayerNorms[layer].Make(residual);

        /* 2) encoder-decoder attention (the encoder output serves as both the first
              and the third argument) -> dropout -> residual -> layer normalization */
        crossAtt = attentionsEnde[layer].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
        if(useDropout)
            crossAtt = Dropout(crossAtt, dropoutP);
        residual = Sum(crossAtt, x);
        x = attEndeLayerNorms[layer].Make(residual);

        /* 3) feed-forward network -> dropout -> residual -> layer normalization */
        ffn = fnns[layer].Make(x, isTraining);
        if(useDropout)
            ffn = Dropout(ffn, dropoutP);
        residual = Sum(ffn, x);
        x = fnnLayerNorms[layer].Make(residual);
    }

    x.SetName(DECODING_NAME);

    return x;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#include <math.h>
#include "T2TEmbedding.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: sizes are marked as "not set" (-1) and no device or memory pool is attached */
T2TEmbedder::T2TEmbedder()
{
    /* sizes are loaded later in InitModel */
    vSize = maxLength = -1;

    /* device id and memory pool */
    mem = NULL;
    devID = -1;
}
/* de-constructor: nothing is released explicitly here (the body is empty) */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
{
devID = myDevID;
mem = myMem;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
}
/*
make positional embeddings (a matrix of size length * eSize).
For every position, the first eSize/2 entries of the row hold the sine values
and the next eSize/2 entries hold the cosine values; both use the angle
pos / 10000^(2i/(d - 2)). When eSize is odd the last entry of a row is not written.
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);

    float * values = new float[posEmbeddingBase.unitNum];
    const int half = eSize / 2;

    for(int pos = 0; pos < length; pos++){
        float * row = values + pos * eSize;

        /* entry i is the sine channel, entry half + i the matching cosine channel */
        for(int i = 0; i < half; i++){
            row[i] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
            row[half + i] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
        }
    }

    posEmbeddingBase.SetData(values, posEmbeddingBase.unitNum);

    delete[] values;
}
/*
make the network: word embeddings (scaled by sqrt(eSize)) plus positional embeddings
>> input - the input tensor; its last dimension is the sequence length, which must be
           smaller than maxLength
           (NOTE(review): the positional part builds an order-2 buffer from dims + 1,
           which looks like it expects an order-2 input - confirm with the callers)
<< return - the sum of the word embeddings and the positional embeddings; its shape is
            the input shape followed by eSize
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
    /* one extra dimension is appended below, so it must still fit into dims[] */
    CheckNTErrors(input.order < MAX_TENSOR_DIM_NUM, "The input tensor has too many dimensions!");
    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

    /* shape of the embeddings: the input shape followed by the embedding size */
    int dims[MAX_TENSOR_DIM_NUM];
    memcpy(dims, input.dimSize, input.order * sizeof(int));
    dims[input.order] = eSize;

    XTensor wordEmbedding;
    XTensor posEmbedding;

    /* we make positional embeddings first: the leading rows of the pre-computed
       base matrix are copied into a buffer and then repeated dims[0] times */
    InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);

    XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);

    _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
    _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);

    DelTensorBuf(posTMP);

    /* then we make word embeddings */
    wordEmbedding = Gather(w, input);
    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TEncoder.h"
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: no layer has been created yet, so every layer array starts as NULL
   (the arrays are allocated in InitModel) */
AttEncoder::AttEncoder()
{
    /* attention layers and feed-forward layers */
    attentions = NULL;
    fnns = NULL;

    /* layer normalizations that follow the two sub-layers */
    attLayerNorms = fnnLayerNorms = NULL;
}
/* de-constructor: releases the per-layer arrays allocated with new[] in InitModel.
   The pointers are NULL when InitModel was never called, in which case delete[] does nothing. */
AttEncoder::~AttEncoder()
{
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    devID = myDevID;
    mem = myMem;
    ignored = myIgnored;

    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(argc, argv, devID, mem);

    /* release the layers of a previous initialization (if any) so that calling
       InitModel again does not leak them; the pointers are NULL after construction */
    delete[] attentions;
    delete[] fnns;
    delete[] attLayerNorms;
    delete[] fnnLayerNorms;

    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
    attLayerNorms = new T2TLN[nlayer];
    fnnLayerNorms = new T2TLN[nlayer];

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
        fnns[i].InitModel(argc, argv, myDevID, myMem);
        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
}
/*
make the encoding network: each layer runs self attention and a feed-forward
network, each followed by (optional) dropout, a residual connection and
layer normalization
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> maskEncDec - no use
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
{
    /* dropout is used only in training and only for a positive dropout probability */
    const bool useDropout = isTraining && dropoutP > 0;

    XTensor x;

    x = embedder.Make(input);

    if(useDropout)
        x = Dropout(x, dropoutP);

    for(int layer = 0; layer < nlayer; layer++){
        XTensor selfAtt;
        XTensor ln;          /* not used below; kept so that the same tensors are created per layer */
        XTensor ffn;
        XTensor residual;

        /* 1) self attention -> dropout -> residual connection -> layer normalization */
        selfAtt = attentions[layer].MakeBig(x, mask, isTraining);
        if(useDropout)
            selfAtt = Dropout(selfAtt, dropoutP);
        residual = Sum(selfAtt, x);
        x = attLayerNorms[layer].Make(residual);

        /* 2) feed-forward network -> dropout -> residual -> layer normalization */
        ffn = fnns[layer].Make(x, isTraining);
        if(useDropout)
            ffn = Dropout(ffn, dropoutP);
        residual = Sum(ffn, x);
        x = fnnLayerNorms[layer].Make(residual);
    }

    /* name the output and the input tensors */
    x.SetName(ENCODING_NAME);
    input.SetName(ENCODING_INPUT_NAME);

    return x;
}
/*
make the encoding network (wrapper): forwards to the four-argument Make with
an empty tensor in place of the encoder-decoder mask
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
{
    /* placeholder for the mask argument that the encoder does not use */
    XTensor unusedMask;

    return Make(input, mask, unusedMask, isTraining);
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: sizes are marked as "not set" (-1) and no device or memory pool is attached */
T2TOutput::T2TOutput()
{
    /* sizes are loaded later in InitModel */
    vSize = inSize = hSize = -1;

    /* device id and memory pool */
    mem = NULL;
    devID = -1;
}
/* de-constructor: nothing is released explicitly here (the body is empty) */
T2TOutput::~T2TOutput()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
float minmax = 0;
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
DTYPE v = 1.0F/(float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
make the network
y = log-softmax(x * w), taken over the last dimension
>> input - input tensor
<< return - output tensor
*/
XTensor T2TOutput::Make(XTensor &input)
{
    return LogSoftmax(MMul(input, w), -1);
}
/*
make the network (redefined output tensor)
output = softmax(input * w), taken over the last dimension; note that this
variant uses Softmax while the one-argument variant uses LogSoftmax
>> input - input tensor
>> output - output tensor (it is also given the name OUTPUT_NAME)
*/
void T2TOutput::Make(XTensor &input, XTensor &output)
{
    output = Softmax(MMul(input, w), -1);
    output.SetName(OUTPUT_NAME);
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor: does nothing; the tester is set up by Init */
T2TTester::T2TTester()
{
}
/* de-constructor: nothing is released explicitly here (the body is empty) */
T2TTester::~T2TTester()
{
}
/*
initialize the tester
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TTester::Init(int argc, char ** argv)
{
/* vocabulary sizes: "-vsizetgt" falls back to the value of "-vsize" when it is not given */
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
/* the batch loader and the searcher read their own options from the same arguments */
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
Dump(ofile, &output);
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, sentCount, exp(loss/wordCount));
}
/*
dump the result into the file: one line per sequence, where each word id is
followed by a blank; a sequence ends right after its first negative id
(which is printed as well)
>> file - data file
>> output - output tensor (its last dimension is taken as the sequence length)
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
    const int seqLength = output->GetDim(-1);

    int start = 0;
    while (start < output->unitNum) {
        /* print the sequence that begins at "start" */
        for (int offset = 0; offset < seqLength; offset++) {
            const int word = output->GetInt(start + offset);
            fprintf(file, "%d ", word);
            if (word < 0)
                break;
        }
        fprintf(file, "\n");

        start += seqLength;
    }
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace transformer
{
/* global file handle; TransformerMain opens it on "tmp.txt" at start-up and closes it before returning */
FILE * tmpFILE;
/* NOTE(review): a global counter that is not referenced in this file - its purpose should be confirmed */
int llnum = 0;
/* NOTE(review): a global file handle that is not referenced in this file - its purpose should be confirmed */
FILE * tf = NULL;
/*
load a string option from the argument list
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> name - name of the option (without the leading '-')
>> p - where the value is copied to; the caller must provide a buffer that is
       large enough for the value (the copy is not bounded)
>> defaultP - value used when the option is missing or has no value after it
If the option occurs several times, the last occurrence wins.
*/
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
    const char * value = defaultP;

    /* an option needs a value after it, so the last argument can never be a hit.
       "-name" is matched in place (no temporary copy), which works for option
       names of any length */
    for(int i = 0; i + 1 < argc; i++){
        if(argv[i][0] == '-' && !strcmp(argv[i] + 1, name))
            value = argv[i + 1];
    }

    strcpy(p, value);
}
/*
load an integer option from the argument list
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> name - name of the option (without the leading '-')
>> p - where the value is kept (the value text is converted by atoi)
>> defaultP - value used when the option is missing or has no value after it
If the option occurs several times, the last occurrence wins.
*/
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
    int value = defaultP;

    /* "-name" is matched in place (no temporary copy), which works for option
       names of any length */
    for(int i = 0; i + 1 < argc; i++){
        if(argv[i][0] == '-' && !strcmp(argv[i] + 1, name))
            value = atoi(argv[i + 1]);
    }

    *p = value;
}
/*
load a boolean option (a flag) from the argument list
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> name - name of the flag (without the leading '-')
>> p - where the value is kept: true when the flag is present, defaultP otherwise
>> defaultP - value used when the flag is missing
Note that a flag can only switch an option on: with defaultP = true the result is always true.
*/
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
    bool value = defaultP;

    /* "-name" is matched in place (no temporary copy), which works for flag
       names of any length */
    for(int i = 0; i < argc; i++){
        if(argv[i][0] == '-' && !strcmp(argv[i] + 1, name))
            value = true;
    }

    *p = value;
}
/*
load a float option from the argument list
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> name - name of the option (without the leading '-')
>> p - where the value is kept (the value text is converted by atof)
>> defaultP - value used when the option is missing or has no value after it
If the option occurs several times, the last occurrence wins.
*/
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
    float value = defaultP;

    /* "-name" is matched in place (no temporary copy), which works for option
       names of any length */
    for(int i = 0; i + 1 < argc; i++){
        if(argv[i][0] == '-' && !strcmp(argv[i] + 1, name))
            value = (float)atof(argv[i + 1]);
    }

    *p = value;
}
/*
print the options in the argument list to stderr
>> argc - number of arguments
>> argv - list of pointers to the arguments
An argument is shown when it starts with '-' and its second character is not a
digit from '1' to '9' (such arguments are treated as values, e.g., negative
numbers). It is shown as "name=value" when the next argument does not start with
'-', and as "name=yes" otherwise.
*/
void ShowParams(int argc, char ** argv)
{
    fprintf(stderr, "args:\n");

    for(int i = 0; i < argc; i++){
        const char * arg = argv[i];

        /* skip empty and one-character arguments; arg[0] is tested first so that
           arg[1] is never read beyond the end of an empty string */
        if(arg[0] == 0 || arg[1] == 0)
            continue;

        if(arg[0] == '-' && (arg[1] < '1' || arg[1] > '9')){
            if(i + 1 < argc && argv[i + 1][0] != '-')
                fprintf(stderr, " %s=%s\n", arg, argv[i + 1]);
            else
                fprintf(stderr, " %s=yes\n", arg);
        }
    }

    fprintf(stderr, "\n");
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include <time.h>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
namespace transformer
{
/*
entry of the transformer sample: read the options, then train, dump, load and/or
test the model depending on which of "-train", "-model", "-test" and "-output" are given
>> argc - number of arguments
>> argv - list of pointers to the arguments
<< return - 1 if no argument is given, 0 otherwise
*/
int TransformerMain(int argc, const char ** argv)
{
    if(argc == 0)
        return 1;

    /* make a writable copy of the arguments */
    char ** args = new char*[argc];
    for(int i = 0; i < argc; i++){
        args[i] = new char[strlen(argv[i]) + 1];
        strcpy(args[i], argv[i]);
    }

    tmpFILE = fopen("tmp.txt", "wb");

    ShowParams(argc, args);

    bool isBeamSearch = false;
    char * trainFN = new char[MAX_LINE_LENGTH];
    char * modelFN = new char[MAX_LINE_LENGTH];
    char * testFN = new char[MAX_LINE_LENGTH];
    char * outputFN = new char[MAX_LINE_LENGTH];

    LoadParamString(argc, args, "train", trainFN, "");
    LoadParamString(argc, args, "model", modelFN, "");
    LoadParamString(argc, args, "test", testFN, "");
    LoadParamString(argc, args, "output", outputFN, "");
    LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);

    srand((unsigned int)time(NULL));

    T2TTrainer trainer;
    trainer.Init(argc, args);

    T2TModel model;
    model.InitModel(argc, args);

    /* learn model parameters ("checkpoint.model" is used when no model file is given) */
    if(strcmp(trainFN, ""))
        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);

    /* save the final model */
    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
        model.Dump(modelFN);

    /* load the model if necessary */
    if(strcmp(modelFN, ""))
        model.Read(modelFN);

    /* test the model on the new data */
    if(strcmp(testFN, "") && strcmp(outputFN, "")){
        /* beam search */
        if(isBeamSearch){
            T2TTester searcher;
            searcher.Init(argc, args);
            searcher.Test(testFN, outputFN, &model);
        }
        /* forced decoding */
        else{
            T2TTrainer tester;
            tester.Init(argc, args);
            tester.Test(testFN, outputFN, &model);
        }
    }

    delete[] trainFN;
    delete[] modelFN;
    delete[] testFN;
    delete[] outputFN;

    for(int i = 0; i < argc; i++)
        delete[] args[i];
    delete[] args;

    /* "tmp.txt" may have failed to open (e.g., in a read-only directory);
       fclose must not be called on a null stream */
    if(tmpFILE != NULL)
        fclose(tmpFILE);

    return 0;
}
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <fstream>
#include <sstream>
#include "Utility.h"
#include "../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace nmt
{
/*
load configurations from the command
>> argc - number of arguments
>> argv - the list of arguments
*/
Config::Config(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
char* configFN = new char[1024];
LoadParamString(argc, args, "config", configFN, "");
int argsNum = argc;
/* load configurations from a file */
if (strcmp(configFN, "") != 0)
argsNum = LoadFromFile(configFN, args);
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
// TODO: refactor the parameters type to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
/* options for training */
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
LoadParamInt(argc, args, "nepoch", &nepoch, 50);
LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argc, args, "nstep", &nstep, 100000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argc, args, "adam", &useAdam, true);
LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argc, args, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1);
LoadParamBool(argc, args, "sorted", &isLenSorted, false);
LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
LoadParamBool(argc, args, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
LoadParamString(argsNum, args, "output", outputFN, "");
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
delete[] configFN;
}
/*
load configurations from a file
>> configFN - path to the configuration file
>> args - the list to store the configurations; every entry written here
          must already point to a writable buffer that is large enough
          for the copied token (no capacity is checked in this function)
<< return - the number of entries written to args (two per option)
format: one option per line ("name value"), separated by a blank or a tab
*/
int Config::LoadFromFile(const char* configFN, char** args) {
    ifstream f(configFN, ios::in);
    CheckNTErrors(f.is_open(), "unable to open the config file");

    int argsNum = 0;

    /* parse arguments */
    string key, value;
    while (f >> key >> value) {
        /* the LoadParam* helpers look an option up as "-name", so the dash
           has to lead the key (it used to be appended, which never matched);
           a key that is already written with a leading dash is kept as is */
        if (key[0] != '-')
            key.insert(0, 1, '-');

        strcpy(args[argsNum++], key.c_str());
        strcpy(args[argsNum++], value.c_str());
    }

    /* record the number of arguments */
    return argsNum;
}
/*
load a string-valued option
>> argc - number of entries in argv
>> argv - the argument list; an option is the token "-<name>" followed by its value
>> name - name of the option, without the leading dash
>> p - destination buffer; the caller must make it large enough for the
       value (or for defaultP), no length is checked here
>> defaultP - value copied to p when the option is missing or is the last token
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
    /* "-<name>" is matched in place rather than being assembled in a
       fixed-size buffer, so a long name cannot overrun the stack.
       Only tokens that still have a successor can carry a value. */
    for (int i = 0; i + 1 < argc; i++) {
        if (argv[i][0] == '-' && strcmp(argv[i] + 1, name) == 0) {
            strcpy(p, argv[i + 1]);
            return;
        }
    }

    strcpy(p, defaultP);
}
/*
load an integer option
>> argc - number of entries in argv
>> argv - the argument list; an option is the token "-<name>" followed by its value
>> name - name of the option, without the leading dash
>> p - where the result is stored
>> defaultP - value stored when the option is missing or is the last token
the value is converted with atoi, so text that is not a number yields 0
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
    /* "-<name>" is matched in place rather than being assembled in a
       fixed-size buffer, so a long name cannot overrun the stack.
       Only tokens that still have a successor can carry a value. */
    for (int i = 0; i + 1 < argc; i++) {
        if (argv[i][0] == '-' && strcmp(argv[i] + 1, name) == 0) {
            *p = atoi(argv[i + 1]);
            return;
        }
    }

    *p = defaultP;
}
/*
load a boolean switch
>> argc - number of entries in argv
>> argv - the argument list
>> name - name of the switch, without the leading dash
>> p - where the result is stored
>> defaultP - value stored when "-<name>" does not occur in argv
the mere presence of "-<name>" stores true; the token that follows it is
not looked at, so a switch whose default is true cannot be turned off here
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
    /* "-<name>" is matched in place rather than being assembled in a
       fixed-size buffer, so a long name cannot overrun the stack */
    for (int i = 0; i < argc; i++) {
        if (argv[i][0] == '-' && strcmp(argv[i] + 1, name) == 0) {
            *p = true;
            return;
        }
    }

    *p = defaultP;
}
/*
load a floating-point option
>> argc - number of entries in argv
>> argv - the argument list; an option is the token "-<name>" followed by its value
>> name - name of the option, without the leading dash
>> p - where the result is stored
>> defaultP - value stored when the option is missing or is the last token
the value is converted with atof, so text that is not a number yields 0
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
    /* "-<name>" is matched in place rather than being assembled in a
       fixed-size buffer, so a long name cannot overrun the stack.
       Only tokens that still have a successor can carry a value. */
    for (int i = 0; i + 1 < argc; i++) {
        if (argv[i][0] == '-' && strcmp(argv[i] + 1, name) == 0) {
            *p = (float)atof(argv[i + 1]);
            return;
        }
    }

    *p = defaultP;
}
/*
tell an option name ("-beamsize") from anything else
>> token - a zero-terminated argument
<< return - true if the token starts with a dash, is longer than the dash
            alone and does not read like a negative number ("-3", "-0.5", "-.5")
*/
static bool IsOptionToken(const char* token)
{
    /* testing token[0] first keeps token[1] from being read on an empty string */
    if (token[0] != '-' || token[1] == 0)
        return false;

    /* token[1] is not the terminator, so token + 2 is still inside the string */
    const char* firstDigit = (token[1] == '.') ? token + 2 : token + 1;
    const bool isNumber = (*firstDigit >= '0' && *firstDigit <= '9');

    return !isNumber;
}

/*
print all options found in the argument list to stderr
>> argc - number of entries in argv
>> argv - the argument list
an option followed by a value is shown as "-name=value" (negative numbers
count as values); an option without a value is shown as "-name=yes"
*/
void ShowParams(int argc, char** argv)
{
    fprintf(stderr, "args:\n");

    for (int i = 0; i < argc; i++) {
        if (!IsOptionToken(argv[i]))
            continue;

        const bool hasValue = (i + 1 < argc) && !IsOptionToken(argv[i + 1]);
        if (hasValue)
            fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
        else
            fprintf(stderr, " %s=yes\n", argv[i]);
    }

    fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
split string by delimiter, this will return indices of all sub-strings
>> s - the original string
>> delimiter - the separator; it may be longer than one character
<< indices - start offsets (into s) of all non-empty sub-strings
an empty delimiter does not split at all: the result is the single offset 0
for a non-empty s and an empty list for an empty s
*/
UInt64List SplitToPos(const string& s, const string& delimiter)
{
    UInt64List indices;

    /* the early return is required: find() with an empty pattern matches at
       every position without advancing, so the loop below would never end */
    if (delimiter.length() == 0) {
        if (s.length() != 0)
            indices.Add(0);
        return indices;
    }

    size_t pos = 0;
    uint64_t start = 0;
    while ((pos = s.find(delimiter, start)) != string::npos) {
        /* pos == start means two delimiters in a row (or a leading one):
           there is no sub-string in between */
        if (pos != start) {
            indices.Add(start);
        }
        start = pos + delimiter.length();
    }

    /* the piece after the last delimiter, unless the string ends with one */
    if (start != s.length()) {
        indices.Add(start);
    }

    return indices;
}
/*
split a string to a list of integers
>> s - the original string
>> delimiter - the separator between the numbers
<< return - one value per sub-string found by SplitToPos, parsed in base 10
*/
IntList SplitInt(const string& s, const string& delimiter)
{
    IntList numbers;
    auto starts = SplitToPos(s, delimiter);

    /* strtol stops at the first character that cannot belong to the
       number, so each piece is parsed up to the next delimiter */
    const char* text = s.data();
    for (int k = 0; k < starts.Size(); k++)
        numbers.Add(strtol(text + starts[k], nullptr, 10));

    return numbers;
}
/*
split a string to a list of floats
>> s - the original string
>> delimiter - the separator between the numbers
<< return - one value per sub-string found by SplitToPos
*/
FloatList SplitFloat(const string& s, const string& delimiter)
{
    FloatList numbers;
    auto starts = SplitToPos(s, delimiter);

    /* strtof stops at the first character that cannot belong to the
       number, so each piece is parsed up to the next delimiter */
    const char* text = s.data();
    for (int k = 0; k < starts.Size(); k++)
        numbers.Add(strtof(text + starts[k], nullptr));

    return numbers;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __UTILITY_H__
#define __UTILITY_H__
#include <string>
#include <cstdio>
#include "../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments
   each function looks for the token "-<name>" in argv and stores the result
   in p; defaultP is stored when the option is not found.
   Int/Float/String take the token after "-<name>" as the value (converted
   with atoi / atof / copied with strcpy, so p must be large enough for a
   string). Bool stores true as soon as "-<name>" is present and does not
   look at the following token. */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments: print the options found in argv to stderr */
void ShowParams(int argc, char** argv);
/* split string
   SplitToPos returns the start offsets of the non-empty sub-strings of s;
   SplitInt and SplitFloat parse the text at each of these offsets with
   strtol (base 10) and strtof respectively */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations of the system: one public field per option, filled by the
   constructor from the command line or by LoadFromFile from a configuration
   file. All paths are fixed-size 1024-byte buffers. */
class Config {
public:
/* path to the model */
char modelFN[1024];
/* path to the source vocab */
char srcVocabFN[1024];
/* path to the target vocab */
char tgtVocabFN[1024];
/* path to the input file (for inference) */
char testFN[1024];
/* path to the output file (for inference) */
char outputFN[1024];
/* path to the training file */
char trainFN[1024];
/* path to the validation file */
char validFN[1024];
/* device id */
int devID;
/* beam size */
int beamSize;
/* word batch size */
int wBatchSize;
/* sentence batch size */
int sBatchSize;
/* number of heads in attention */
int nhead;
/* number of encoder layers */
int nEncLayer;
/* number of decoder layers */
int nDecLayer;
/* the maximum relative position in RPR attentions */
int maxRP;
/* the dimension of embeddings */
int embSize;
/* the dimension of hidden layer */
int modelSize;
/* the maximum length in positional embedding */
int maxPosLen;
/* the dimension of fnn hidden layer */
int fnnHiddenSize;
/* the vocab size of source sequence */
int srcVocabSize;
/* the vocab size of target sequence */
int tgtVocabSize;
/* the padding id */
int padID;
/* start symbol */
int startID;
/* end symbol */
int endID;
/* indicates whether the model uses pre-norm */
bool preNorm;
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the encoder and the decoder share their embeddings
   (stored as an int, not a bool) */
int shareAllEmbeddings;
/* indicates whether the decoder embeddings and the output weights are shared
   (stored as an int, not a bool) */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* indicates whether we use the RPR attention */
bool useRPR;
/* indicates whether we train the model */
bool isTraining;
/* dropout rate for the model */
float dropout;
/* dropout rate for fnn layers */
float fnnDropout;
/* dropout rate for attention layers */
float attDropout;
/* the alpha parameter controls the length preference */
float lenAlpha;
/* scalar of the input sequence (for max number of search steps) */
float maxLenAlpha;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* training epoch number */
int nepoch;
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */
bool useAdam;
/* hyper parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
/* step number of warm-up for training */
int nwarmup;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
float labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* buffer size */
int bufSize;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* load configurations from the command line
   >> argc - number of arguments
   >> argv - the arguments, options are given as "-name value" */
Config(int argc, const char** argv);
/* load configurations from a file (one "name value" pair per line) into
   args and return the number of entries written */
int LoadFromFile(const char* configFN, char** args);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __ATTENTION_H__
#define __ATTENTION_H__
#include "NNUtil.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* attention type: no attention, self-attention, or encoder-decoder attention
   (presumably the values passed as "cacheType" to Attention::Make - TODO confirm) */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values: holds the key and value tensors of one
   attention layer together with two status flags */
class Cache
{
public:
/* cache for keys, (B, L, H) */
XTensor key;
/* cache for values, (B, L, H) */
XTensor value;
public:
/* indicates cache miss if 'true' */
bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */
Cache();
/* update the states cache with new keys (k) and values (v);
   both are taken as rvalue references */
void Update(XTensor&& k, XTensor&& v);
/* keep alive states
   (presumably keeps only the entries selected by aliveIdx - TODO confirm) */
void KeepAlive(XTensor& aliveIdx);
/* reorder alive states
   (presumably rearranges the entries according to "reorder" - TODO confirm) */
void Reorder(XTensor& reorder);
};
/* multi-head attention: owns the projection weights for queries, keys and
   values, the output projection and the settings of the attention layer */
class Attention
{
public:
/* device id */
int devID;
/* head number */
int nhead;
/* transformation matrix for Q */
XTensor weightQ;
/* bias for Q */
XTensor biasQ;
/* transformation matrix for K */
XTensor weightK;
/* bias for K */
XTensor biasK;
/* transformation matrix for V */
XTensor weightV;
/* bias for V */
XTensor biasV;
/* NOTE(review): undocumented in the original; the names suggest one merged
   weight/bias for several projections - confirm against the implementation */
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor RPEmbK;
/* transformation after dot-product attention */
XTensor weightO;
/* bias after dot-product attention */
XTensor biasO;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether we use the RPR attention */
bool useRPR;
/* dropout probability */
DTYPE dropoutP;
/* the maximum relative window size */
int maxRP;
public:
/* constructor */
Attention();
/* de-constructor */
~Attention();
/* initialize the model from the configurations */
void InitModel(Config& config);
/* make the network
   >> k, q, v - keys, queries and values (note the order: k comes first)
   >> mask - pointer to the attention mask
   >> isTraining - indicates whether the model is used for training
   >> cache - pointer to the key/value cache of this layer
   >> cacheType - an integer selecting how the cache is used
      (presumably one of NONE, SELF_ATT, EN_DE_ATT - TODO confirm) */
XTensor Make(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining,
Cache* cache, int cacheType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining);
/* make the RPR attention network given keys, queries and values (after
   linear transformation); isEnc tells whether it is used in the encoder */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings for queries of length lenQ and
   keys/values of length lenKV */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
* This file includes some common modules of the Transformer model
*/
#include "CommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace nmt
{
/*
flexible layer normalization for the Transformer
>> input - input tensor
>> ln - the layernorm network
>> prenorm - whether we use prenorm or not
>> before - whether we use layernorm before attention/fnn
            (not read by this function)
>> after - whether we use layernorm after attention/fnn
<< return - the normalized input if exactly one of "after" and "prenorm"
            is set, the input itself otherwise
*/
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{
    /* a pre-norm model normalizes on the call with after == false,
       a post-norm model on the call with after == true */
    const bool skipNorm = (after == prenorm);
    if (skipNorm)
        return input;

    return ln.Make(input);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -15,32 +14,24 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <stdio.h>
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
namespace transformer
{
#include "LayerNorm.h"
#include "CommonModules.h"
extern FILE * tmpFILE;
using namespace nts;
/* load arguments */
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, char ** argv);
namespace nmt
{
extern int llnum;
extern FILE * tf;
/* the layer normalization module to control pre-norm or post-norm*/
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
}
#endif
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace nmt
{
/* constructor: every scalar setting starts at -1 ("not configured") until
   InitModel is called, so that the checks in Make (e.g. eSize > 0) never
   read an indeterminate value */
Embedder::Embedder()
{
    devID = -1;
    vSize = -1;
    eSize = -1;
    maxLength = -1;
    d = -1;
    padIdx = -1;
}
/* de-constructor: nothing to release by hand, the members clean up themselves */
Embedder::~Embedder() = default;
/*
initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder (selects the source
            vocabulary size, otherwise the target vocabulary size is used)
*/
void Embedder::InitModel(Config& config, bool isEnc)
{
    /* settings copied from the configurations */
    devID = config.devID;
    padIdx = config.padID;
    d = config.modelSize;
    eSize = config.embSize;
    vSize = isEnc ? config.srcVocabSize : config.tgtVocabSize;

    /* the position table gets two more rows than the configured maximum length */
    maxLength = config.maxPosLen + 1 + 1;

    /* the word embedding matrix (vSize * eSize), randomly initialized
       with SetDataRandn(0, 1/sqrt(eSize)) */
    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
    DTYPE v = 1.0F / (float)sqrt((float)eSize);
    w.SetDataRandn(0, v);

    /* create the positional embedding matrix */
    MakePosEmbedding(maxLength);
}
/*
make positional embeddings (of size length * eSize)
each row holds eSize / 2 sine values followed by eSize / 2 cosine values;
the row of the padding index is set to zero
>> length - length of the sequence, i.e., the number of rows
*/
void Embedder::MakePosEmbedding(int length)
{
    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

    /* value-initialized: for an odd eSize the last column is covered by
       neither the sine nor the cosine half and must not stay indeterminate */
    float* data = new float[posEmbeddingBase.unitNum]();

    const int channelSize = eSize / 2;

    /* with a single channel "channelSize - 1" is 0 and the exponent below
       would become 0/0 = NaN, so the divisor is kept at 1 or more */
    const int timescaleSteps = (channelSize > 1) ? (channelSize - 1) : 1;

    for (int pos = 0; pos < length; pos++) {
        float* dp = data + pos * eSize;
        int offset = 0;
        for (int i = 0; i < channelSize; i++) {
            dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / timescaleSteps));
        }
        for (int i = 0; i < channelSize; i++) {
            dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / timescaleSteps));
        }
    }

    /* padding zeros (skipped if the padding index is not a row of the table,
       which would otherwise write outside of the buffer) */
    if (padIdx >= 0 && padIdx < length) {
        float* padRow = data + padIdx * eSize;
        for (int i = 0; i < eSize; i++)
            padRow[i] = 0.F;
    }

    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);

    /* keep the table in the same data type as the word embeddings */
    if (w.dataType != posEmbeddingBase.dataType)
        posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);

    delete[] data;
}
/*
make the network
>> input - the word indices
>> isDec - indicates whether it is decoder
>> isTraining - indicates whether it is training
>> nstep - the current step; only read in the branch for a decoder that is
           not training and gets a single position per sequence
<< return - word & position embeddings of the input
*/
XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* check the shape of the input and the settings of the model
   (the padding index itself is not checked here) */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
/* NOTE(review): the largest position gathered below is
   (length - 1) + padIdx + 1, while this check only bounds the length
   itself by maxLength - confirm that it is tight enough */
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
/* one position index per item of the last dimension of the input */
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
if (!isDec || isTraining || input.GetDim(-1) > 1)
{
/* positions 0, 1, 2, ... shifted by padIdx + 1 */
position.Range(0, position.unitNum, 1);
// disable grad
ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
}
else
{
/* decoder embeddings during decoding: every entry is the position of
   the current step, shifted by padIdx + 1 as above */
position.SetDataFixed(nstep + padIdx + 1);
}
/* we make positional embeddings first: pick the rows of the table and
   add a leading dimension of size input.GetDim(0) */
XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings, scaled by sqrt(eSize) */
//w.enableGrad = false;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings (the sum is stored in wordEmbedding) */
SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,33 +16,32 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../../network/XNet.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
#define DEFAULT_EMBEDDING_SIZE 512
/*
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
class Embedder
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -53,31 +51,34 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
/* dimension size of the hidden layers in the model */
int d;
/* padding index */
int padIdx;
/* word embedding matrix */
XTensor w;
/* predefined positional embeddings. It can speeds up
/* predefined positional embeddings. It can speeds up
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
public:
/* constructor */
T2TEmbedder();
Embedder();
/* de-constructor */
~T2TEmbedder();
~Embedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
void MakePosEmbedding(int length);
/* make the network */
XTensor Make(XTensor &input);
XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,88 +16,81 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/function/FHeader.h"
#include "FNN.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TFNN::T2TFNN()
FNN::FNN()
{
inSize = -1;
inSize = -1;
outSize = -1;
hSize = -1;
hSize = -1;
}
/* deconstructor */
T2TFNN::~T2TFNN()
/* de-constructor */
FNN::~FNN()
{
}
/*
initialize the model
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
>> config - configurations of the model
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void FNN::InitModel(Config& config)
{
devID = myDevID;
mem = myMem;
float minmax = 0;
devID = config.devID;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
inSize = config.modelSize;
outSize = config.modelSize;
hSize = config.fnnHiddenSize;
dropoutP = config.fnnDropout;
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1D(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
float scale = 1.0F;
float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
w1.SetDataRand(-finfout1, finfout1);
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
w2.SetDataRand(-finfout2, finfout2);
b2.SetZeroAll();
}
/*
make the network
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor FNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
//t1 = Rectify(MMul(input, w1) + b1);
t1 = Rectify(MulAndShift(input, w1, b1));
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
//return MMul(t1, w2) + b2;
return MulAndShift(t1, w2, b2);
}
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,28 +16,28 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#ifndef __FNN_H__
#define __FNN_H__
#include "../../tensor/XTensor.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
class FNN
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* size of input vector */
int inSize;
......@@ -59,24 +58,23 @@ public:
/* bias of transformation 2 */
XTensor b2;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TFNN();
FNN();
/* deconstructor */
~T2TFNN();
/* de-constructor */
~FNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
XTensor Make(XTensor& input, bool isTraining);
};
}
......
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include "GLU.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace nmt
{
/* constructor: all sizes start at -1, i.e., "not configured yet" */
GLU::GLU()
{
    inSize = outSize = hSize = -1;
}
/* de-constructor: nothing to release by hand, the members clean up themselves */
GLU::~GLU() = default;
/*
initialize the model
>> config - configurations of the model (devID and modelSize are read)
the four parameter tensors are only allocated here; no initial values are
assigned to them in this function
*/
void GLU::InitModel(Config& config)
{
    devID = config.devID;

    inSize = config.modelSize;
    outSize = config.modelSize;

    /* Make() splits its input into two halves along the last dimension and
       multiplies each half with one weight matrix, so the matrices need
       inSize / 2 rows. hSize used to keep the -1 of the constructor here,
       which is not a valid dimension. */
    hSize = inSize / 2;

    InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b1, outSize, X_FLOAT, devID);

    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b2, outSize, X_FLOAT, devID);
}
/*
make the network
y = (x1 * W1 + b1) * sigmoid(x2 * W2 + b2)
where x1 and x2 are the two halves of the input along its last dimension
>> input - the input tensor
<< return - the output tensor
*/
XTensor GLU::Make(XTensor& input)
{
    XTensor linearPart;
    XTensor gatePart;
    TensorList halves;

    /* cut the input into two pieces along the last dimension */
    Split(input, halves, -1, 2);

    /* first half: plain linear transformation */
    linearPart = MulAndShift(halves.GetItem(0), w1, b1);

    /* second half: linear transformation that feeds the sigmoid gate */
    gatePart = MulAndShift(halves.GetItem(1), w2, b2);

    return linearPart * Sigmoid(gatePart);
}
}
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论