Commit b9c318bd by hello

no message

parent 1d17c439
......@@ -4,3 +4,4 @@ x64/
vc140.pdb
NiuTrans.Tensor.vcxproj.user
NiuTrans.Tensor.aps
*.tgz
......@@ -97,35 +97,47 @@ if(USE_CUDA)
add_definitions(-DHALF_PRECISION)
endif()
find_package(CUDA REQUIRED)
if(WIN32)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 --disable-warnings -use_fast_math -DUSE_CUDA")
if(USE_HALF_PRECISION)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-DHALF_PRECISION")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
else()
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
endif()
if(USE_HALF_PRECISION)
if(NOT DEFINED GPU_ARCH)
set(ARCH_FLAGS -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_72,code=sm_72
-gencode=arch=compute_70,code=compute_70
)
elseif(${GPU_ARCH} STREQUAL K OR ${GPU_ARCH} STREQUAL M)
message(FATAL_ERROR "your GPU cannot use the function half precision")
endif()
endif()
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_ROOT}/lib/x64")
include_directories("${CUDA_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
endif()
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
......@@ -133,31 +145,14 @@ if(USE_CUDA)
else()
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
if(USE_HALF_PRECISION)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DHALF_PRECISION")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
else()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
endif()
link_directories(${CUDA_ROOT}/lib64)
include_directories(${CUDA_ROOT}/include)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
link_directories("${CUDA_ROOT}/lib64")
include_directories("${CUDA_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
endif()
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# NiuTrans.Tensor环境配置
## 注意事项
CUDA最新版本9.2尚且不支持VS2017最新版本,因此建议使用CUDA版本为9.0或9.1,建议使用VS版本为VS2015,或使用VS2017时安装v140工具集,解决方案平台设置为×64。
## CUDA配置
在已安装好VS、CUDA并配置好环境变量后,一些关键的CUDA配置选项如下所示,以下配置选项在 **项目 -> 属性** 中可以找到。
>$(CUDA_PATH)\include
加入到 **VC++目录 -> 包含** 中。
>$(CUDA_PATH)\lib\Win32
加入到 **VC++目录 -> 库** 中。
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
加入到 **链接器->输入->附加依赖项** 中。
配置完成后,右键 **工程->项目依赖性** ,选择CUDA9。
在.cu文件上右键属性,在项类型中选择"CUDA C/C++"(最好搜索.cu文件,然后全选设置)。
## 其他配置
**C/C++->常规->SDL检查**,设为否。
**C/C++->预处理器->预处理器定义** 中,添加
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
CONSOLE;
**链接器->系统->子系统**,设置为控制台。
**常规->字符集**,使用Unicode字符集。
**调试->命令参数**中设置可执行文件所需要的参数。
......@@ -39,7 +39,7 @@ NiuTensor工具包可以在Windows、Linux以及macOS环境下进行安装,支
- 执行CMake命令对Visual Studio项目进行生成(如果 visual studio 版本低于 2019,则在使用下列命令的时候需额外加上`-A x64`的CMake参数),如计划生成动态链接库,则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可,否则默认生成可执行程序。
- 如项目计划启用MKL数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_MKL=ON`参数,并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库(Intel工具包)的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- 如项目计划启用OpenBLAS数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数,并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 执行成功将显示`Build files have been written to:...`
- 打开build目录中的NiuTensor.sln文件即可通过Visual Studio打开NiuTensor项目。
- 打开后在解决方案管理器中选中NiuTensor,右键将其设为启动项目即可开始使用。
......@@ -60,7 +60,7 @@ NiuTensor工具包可以在Windows、Linux以及macOS环境下进行安装,支
- 打开CLion首选项,点击“构建,执行,部署”选项卡中的CMake,在“CMake选项”中进行设置,设置完成后CLion将自动使用CMake对项目进行构建,如计划生成动态链接库,则仅需在在“CMake选项”中额外加上`-DGEN_DLL=ON`的CMake参数即可,否则默认生成可执行程序。
- 如项目计划启用MKL数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_MKL=ON`,并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库(Intel工具包)的安装路径。如`-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- 如项目计划启用OpenBLAS数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_OPENBLAS=ON`,并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径。如`-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2'`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P `。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
##### CMake方式(命令行)
......@@ -71,7 +71,7 @@ NiuTensor工具包可以在Windows、Linux以及macOS环境下进行安装,支
- 执行CMake命令对项目进行生成,如计划生成动态链接库,则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可,否则默认生成可执行程序。
- 如项目计划启用MKL数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_MKL=ON`参数,并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库(Intel工具包)的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- 如项目计划启用OpenBLAS数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数,并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 执行成功将显示`Build files have been written to:...`并在该目录下生成Makefile文件。
- 执行`make -j`命令对NiuTensor项目进行编译,执行成功将显示`Built target NiuTensor`,安装完毕。
......
......@@ -26,7 +26,7 @@
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
#include "./sample/transformer/NMT.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -34,7 +34,7 @@
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace nmt;
int main( int argc, const char ** argv )
{
......@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
NMTMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TDecoder.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(T2TConfig& config)
void AttDecoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nDecLayer;
......@@ -80,16 +77,17 @@ void AttDecoder::InitModel(T2TConfig& config)
/* embedding model */
embedder.InitModel(config, false);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
selfAttLayerNorms = new LN[nlayer];
enDeAtt = new Attention[nlayer];
enDeAttLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new T2TLN;
decoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
......@@ -99,6 +97,8 @@ void AttDecoder::InitModel(T2TConfig& config)
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
selfAttCache[i].enable = true;
enDeAttCache[i].enable = true;
}
if (preNorm)
decoderLayerNorm->InitModel(config);
......@@ -115,9 +115,10 @@ make the decoding network
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
......@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
}
if (preNorm)
x = decoderLayerNorm->Make(x);
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for encoder-decoder attention */
x = enDeAttLayerNorms[i].Make(x);
/* encoder-decoder attention */
x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
x = decoderLayerNorm->Make(x);
return x;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#ifndef __DECODER_H__
#define __DECODER_H__
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
#include "Encoder.h"
#include "Utility.h"
namespace transformer
namespace nmt
{
class AttDecoder
......@@ -52,28 +51,28 @@ public:
DTYPE dropoutP;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN* fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention* selfAtt;
Attention* selfAtt;
/* layer normalization for attention */
T2TLN* selfAttLayerNorms;
LN* selfAttLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for decoder */
T2TLN* decoderLayerNorm;
LN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
T2TAttention* enDeAtt;
Attention* enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN* enDeAttLayerNorms;
LN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
......@@ -92,11 +91,15 @@ public:
~AttDecoder();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the decoding network */
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(T2TConfig& config)
void AttEncoder::InitModel(Config& config)
{
devID = config.devID;
......@@ -68,18 +65,18 @@ void AttEncoder::InitModel(T2TConfig& config)
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(config);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
attLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
if (preNorm)
encoderLayerNorm = new T2TLN;
encoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
......@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
......@@ -151,7 +148,63 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = encoderLayerNorm->Make(x);
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> maskEncDec - no use
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = attLayerNorms[i].Make(x);
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
x = encoderLayerNorm->Make(x);
return x;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,25 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TENCODER_H__
#define __T2TENCODER_H__
#ifndef __ENCODER_H__
#define __ENCODER_H__
#include "module/T2TFNN.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "module/T2TEmbedding.h"
#include "module/T2TLayerNormal.h"
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
base class of the encoder
*/
class T2TEncoder
class Encoder
{
public:
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
......@@ -47,7 +46,7 @@ public:
/*
the encoder based on self-attention
*/
class AttEncoder : T2TEncoder
class AttEncoder : Encoder
{
public:
/* device id */
......@@ -73,22 +72,22 @@ public:
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN* fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention* selfAtt;
Attention* selfAtt;
/* layer normalizations for attention */
T2TLN* attLayerNorms;
LN* attLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for encoder */
T2TLN* encoderLayerNorm;
LN* encoderLayerNorm;
/* the location of layer normalization */
bool preNorm;
......@@ -101,11 +100,14 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the encoding network */
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,23 +19,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TMODEL_H__
#define __T2TMODEL_H__
#ifndef __MODEL_H__
#define __MODEL_H__
#include "T2TEncoder.h"
#include "T2TDecoder.h"
#include "module/T2TFNN.h"
#include "module/T2TOutput.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "Utility.h"
#include "module/Attention.h"
namespace transformer
namespace nmt
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
/* a nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
{
public:
/* device id */
......@@ -49,7 +47,7 @@ public:
AttDecoder* decoder;
/* output layer */
T2TOutput* outputLayer;
Output* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -71,13 +69,16 @@ public:
public:
/* constructor */
T2TModel();
Model();
/* de-constructor */
~T2TModel();
~Model();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,49 +16,47 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/
#include <cmath>
#include <ctime>
#include "Transformer.h"
#include "train/T2TTrainer.h"
#include "module/T2TUtility.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
#include "NMT.h"
#include "train/Trainer.h"
#include "translate/Translator.h"
namespace transformer
namespace nmt
{
int TransformerMain(int argc, const char** argv)
int NMTMain(int argc, const char** argv)
{
if (argc == 0)
return 1;
/* load configurations */
T2TConfig config(argc, argv);
Config config(argc, argv);
srand((unsigned int)time(NULL));
srand(1);
/* train the model */
/* training */
if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config);
T2TTrainer trainer;
Trainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
/* translate the test file */
/* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config);
T2TTranslator translator;
Translator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,29 +15,17 @@
*/
/*
*
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* work on MT
* An implementation of the NMT system.
*/
#ifndef __TRANSFORMER_H__
#define __TRANSFORMER_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
#ifndef __NMT_H__
#define __NMT_H__
namespace transformer
namespace nmt
{
/* entrance of the program */
int TransformerMain(int argc, const char** argv);
int NMTMain(int argc, const char** argv);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
/* cache for keys */
XTensor key;
/* cache for values */
XTensor value;
public:
bool miss;
Cache() {
miss = true;
}
void Update(XTensor&& k, XTensor&& v) {
key = k;
value = v;
miss = false;
}
};
/*
multi-head attention
y(Q, K, V) = cat(head_1, head_2, ..., head_n)
where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
d_k = dimension size of K
*/
class T2TAttention
{
public:
/* device id */
int devID;
/* head number */
int nhead;
/* transformation matrix for Q */
XTensor wq;
/* bias for Q */
XTensor bq;
/* transformation matrix for K */
XTensor wk;
/* bias for K */
XTensor bk;
/* transformation matrix for V */
XTensor wv;
/* bias for V */
XTensor bv;
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wo;
/* bias after dot-product attention */
XTensor bo;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
/* max relative window size */
int max_relative_position;
public:
/* constructor */
T2TAttention();
/* de-constructor */
~T2TAttention();
/* initialize the model */
void InitModel(int argc, char** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
/* make the network */
XTensor Make( XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, Cache* cache, int cacheType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
void GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int device_id, const bool is_encoder);
void RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#include <math.h>
#include "T2TEmbedding.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* deconstructor */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
devID = myDevID;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 1024);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength, padIdx);
}
/*
make positional embeddings (of size eSize * length)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
}
/* padding zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
delete[] data;
}
/*
make the network
*/
XTensor T2TEmbedder::Make(XTensor &input, int prevLen, int nstep, bool isDec)
{
/* assert padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensorV2(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
if (!isDec)
{
for (int i = 0; i < inputCPU.GetDim(0); i++) {
int startNoPad = 2 + prevLen;
int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
for (int j = 0; j < inputCPU.GetDim(1); j++) {
if (p[j] == 1) {
posData[i * inputCPU.GetDim(1) + j] = 1;
}
else {
posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
}
else
{
for (int i = 0; i < position.GetDim(0); i++) {
for (int j = 0; j < position.GetDim(1); j++) {
position.Set2DInt(nstep + 2, i, j);
}
}
}
delete[] posData;
/* we make positional embeddings first */
if (true) {
posEmbedding = Gather(posEmbeddingBase, position);
}
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return Sum(wordEmbedding, posEmbedding);
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 128
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* padding index */
int padIdx;
/* word embedding matrix */
XTensor w;
/* predefined positional embeddings. It can speeds up
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
public:
/* constructor */
T2TEmbedder();
/* de-constructor */
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length, int padIdx);
/* make the network */
XTensor Make(XTensor &input, int prevLen=0, int nstep = -1, bool isDec = false);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
T2TFNN::T2TFNN()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* deconstructor */
T2TFNN::~T2TFNN()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TFNN::InitModel(int argc, char** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID);
//float scale = 1.0F;
//float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
//float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
//
//w1.SetDataRand(-finfout1, finfout1);
//b1.SetZeroAll();
//w2.SetDataRand(-finfout2, finfout2);
//b2.SetZeroAll();
}
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
XTensor res;
res = MulAndShift(t1, w2, b2);
_SumMe(&res, &input);
return res;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#include "T2TLayerNormal.h"
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
/* layer normalization for fnn */
T2TLN fnnLayerNorm;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TFNN();
/* deconstructor */
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
devID = -1;
d = 0;
}
/* de-constructor */
T2TLN::~T2TLN()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
}
/*
make the network
for each layer representation x, we have
y =
>> input - the input tensor
>> return - layer normalization output
*/
XTensor T2TLN::Make(XTensor &input)
{
XTensor &x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
XTensor meanFilled;
XTensor standardFilled;
/* \mu = (sum_i x_i)/m */
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
/* device id */
int devID;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
T2TLN();
/* de-constructor */
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter controls the length preference
<< return - length penaltyof the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences because they have higher score
in product of a sequence of probability-like terms and have more chances
to beat others in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
XTensor GNMT(const XTensor & length, float alpha);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TOutput::T2TOutput()
{
devID = -1;
vSize = -1;
inSize = -1;
hSize = -1;
}
/* de-constructor */
T2TOutput::~T2TOutput()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
*/
void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#include "../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
#define OUTPUT_NAME "output"
/* output layer */
class T2TOutput
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* input vector size */
int inSize;
/* vector size of the linear transformation */
int hSize;
/* transformation matrix */
XTensor w;
public:
/* constructor */
T2TOutput();
/* de-constructor */
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
/* make the network (redefined output tensor) */
void Make(XTensor &input, XTensor &output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
#include <iostream>
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if (states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = 2;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probablities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
>>> isStart - is the start or not
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
{
int dims[MAX_TENSOR_DIM_NUM];
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for (int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
SetDataFixedInt(first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s);
inputDec.SetDevice(inputEnc->devID);
}
/* prediction probabilities */
XTensor& output = next->prob;
XTensor decoding;
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, *encoding, &maskDec, maskEncDec, false);
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
/* generate the output probabilities */
m->outputLayer->Make(decoding, output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
/*
get the predictions of the previous step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
XTensor lastPred;
InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
lastPred.Set2DInt(cur->prediction, i, 0);
}
return lastPred;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
and etc. It can be regarded as a hypothsis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* nubmer of steps we go over so far */
int nstep;
/* pointer to the previous state */
T2TState* last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* list of states */
T2TState* states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regared as the representation of words (word
indices, hidden states, embeddings and etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel* m;
/* current state */
T2TStateBundle* s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class orgnizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path respresents a translation hypothsis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scalar of the input sequence (for max number of search steps) */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char** argv);
/* search for the most promising states */
void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output, XTensor* score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam);
/* fill the hypotheis heap with incomplete hypothses */
void FillHeap(T2TStateBundle* beam);
/* save the output sequences and score */
void Dump(XTensor* output, XTensor* score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/*check whether all hypotheses are completed*/
bool IsAllCompleted(T2TStateBundle* beam);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
#include "..//..//tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
/* data files */
FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
/* padding */
XTensor paddingEnc;
/* an array that keeps the sequences */
int* seqs = new int[MILLION];
batchLoader.Init(fn);
int count = 0;
while (!batchLoader.IsEmpty())
{
count++;
wordCount = 0;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensorV2(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
res.values = sent;
res.id = indices[i];
batchLoader.resBuffer.emplace_back(res);
}
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n", elapsed, sentCount, wordCount);
}
}
batchLoader.RerankRes();
for (auto res : batchLoader.resBuffer) {
Dump(ofile, &res.values);
}
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE* file, XTensor* output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
if (w < 0 || w == 1)
break;
fprintf(file, "%d ", w);
}
fprintf(file, "\n");
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "t2tdata/DataSet.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* batch size for sentences */
int sentBatch;
/* for batching */
DataSet batchLoader;
/* decoder for inference */
T2TSearch seacher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the model */
void Init(int argc, char** argv);
/* test the model */
void Test(const char* fn, const char* ofn, T2TModel* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
strcpy(p, argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*(int*)p = atoi(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname)){
*(bool*)p = true;
//fprintf(stderr, " %s=%s\n", name, "true");
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = (float)atof(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#include <stdio.h>
namespace transformer
{
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -27,13 +26,13 @@
#include <fstream>
#include <sstream>
#include "T2TUtility.h"
#include "../../../tensor/XGlobal.h"
#include "Utility.h"
#include "../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace transformer
namespace nmt
{
/*
......@@ -41,7 +40,7 @@ load configurations from the command
>> argc - number of arguments
>> argv - the list of arguments
*/
T2TConfig::T2TConfig(int argc, const char** argv)
Config::Config(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
......@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 8);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 256);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
LoadParamString(argsNum, args, "model", modelFN, "model.bin");
LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
// TODO: refactor the parameters type to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
......@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argc, args, "lrate", &lrate, 1.0F);
LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
LoadParamInt(argc, args, "nepoch", &nepoch, 20);
LoadParamInt(argc, args, "nepoch", &nepoch, 50);
LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argc, args, "nstep", &nstep, 100000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argc, args, "adam", &useAdam, true);
LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
......@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1);
LoadParamBool(argc, args, "debug", &isDebugged, false);
LoadParamBool(argc, args, "sorted", &isLenSorted, false);
LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
......@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, args, "bucketsize", &bucketSize, 0);
LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
......@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++)
delete[] args[i];
......@@ -136,7 +139,7 @@ load configurations from a file
>> args - the list to store the configurations
format: one option per line, separated by a blank or a tab
*/
int T2TConfig::LoadFromFile(const char* configFN, char** args) {
int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,18 +19,18 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#ifndef __UTILITY_H__
#define __UTILITY_H__
#include <string>
#include <cstdio>
#include "../../../tensor/XList.h"
#include "../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace transformer
namespace nmt
{
#define MAX_PARAM_NUM 100
......@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */
class T2TConfig {
/* configurations for */
class Config {
public:
/* path to the model */
char modelFN[1024];
......@@ -131,6 +130,12 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
......@@ -164,9 +169,12 @@ public:
/* training epoch number */
int nepoch;
/* traing step number */
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */
bool useAdam;
......@@ -193,9 +201,6 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
......@@ -222,7 +227,7 @@ public:
public:
/* load configurations from the command */
T2TConfig(int argc, const char** argv);
Config(int argc, const char** argv);
/* load configurations from a file */
int LoadFromFile(const char* configFN, char** args);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,17 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#ifndef __ATTENTION_H__
#define __ATTENTION_H__
#include "T2TNNUtil.h"
#include "T2TUtility.h"
#include "NNUtil.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
......@@ -50,6 +49,9 @@ public:
/* indicates cache miss if 'true' */
bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */
Cache();
......@@ -64,7 +66,7 @@ public:
};
/* multi-head attention */
class T2TAttention
class Attention
{
public:
/* device id */
......@@ -74,22 +76,22 @@ public:
int nhead;
/* transformation matrix for Q */
XTensor wq;
XTensor weightQ;
/* bias for Q */
XTensor bq;
XTensor biasQ;
/* transformation matrix for K */
XTensor wk;
XTensor weightK;
/* bias for K */
XTensor bk;
XTensor biasK;
/* transformation matrix for V */
XTensor wv;
XTensor weightV;
/* bias for V */
XTensor bv;
XTensor biasV;
XTensor wBig;
......@@ -99,10 +101,10 @@ public:
XTensor RPEmbK;
/* transformation after dot-product attention */
XTensor wo;
XTensor weightO;
/* bias after dot-product attention */
XTensor bo;
XTensor biasO;
/* size of transformed Q and K */
int dk;
......@@ -124,13 +126,13 @@ public:
public:
/* constructor */
T2TAttention();
Attention();
/* de-constructor */
~T2TAttention();
~Attention();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v,
......@@ -145,8 +147,10 @@ public:
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,11 @@
* This file includes some common modules of the Transformer model
*/
#include <cmath>
#include "T2TCommonModules.h"
#include "CommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/*
......@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
>> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn
*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{
if (after ^ prenorm)
return ln.Make(input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,16 +21,16 @@
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "T2TLayerNormal.h"
#include "T2TCommonModules.h"
#include "LayerNorm.h"
#include "CommonModules.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* the layer normalization module to control pre-norm or post-norm*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,17 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TEmbedder::T2TEmbedder()
Embedder::Embedder()
{
devID = -1;
vSize = -1;
......@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
}
/* de-constructor */
T2TEmbedder::~T2TEmbedder()
Embedder::~Embedder()
{
}
......@@ -47,7 +44,7 @@ initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
void Embedder::InitModel(Config& config, bool isEnc)
{
devID = config.devID;
d = config.modelSize;
......@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int length)
void Embedder::MakePosEmbedding(int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
......@@ -110,58 +107,45 @@ make the network
>> isTraining - indicates whether it is training
<< return - word & position embeddings of the input
*/
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* make sure the padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
if (!isDec)
if (!isDec || isTraining || input.GetDim(-1) > 1)
{
/* encoder embeddings */
for (int i = 0; i < inputCPU.dimSize[0]; i++) {
int startNoPad = 1 + 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
for (int j = 0; j < inputCPU.dimSize[1]; j++) {
if (p[j] == 1) {
posData[i * inputCPU.dimSize[1] + j] = 1;
}
else {
posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
position.Range(0, position.unitNum, 1);
// disable grad
ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
}
else
{
/* decoder embeddings */
position.SetDataFixed(nstep + 2);
/* decoder embeddings during decoding */
position.SetDataFixed(nstep + padIdx + 1);
}
delete[] posData;
/* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position);
XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */
//w.enableGrad = false;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "T2TUtility.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
#define DEFAULT_EMBEDDING_SIZE 512
......@@ -37,7 +36,7 @@ namespace transformer
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
class Embedder
{
public:
/* device id */
......@@ -52,7 +51,7 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
/* dimension size of the hidden layers in the model */
int d;
/* padding index */
......@@ -67,13 +66,13 @@ public:
public:
/* constructor */
T2TEmbedder();
Embedder();
/* de-constructor */
~T2TEmbedder();
~Embedder();
/* initialize the model */
void InitModel(T2TConfig& config, bool isEnc = true);
void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int length);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,19 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "FNN.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TFNN::T2TFNN()
FNN::FNN()
{
inSize = -1;
outSize = -1;
......@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
}
/* de-constructor */
T2TFNN::~T2TFNN()
FNN::~FNN()
{
}
......@@ -50,7 +47,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> config - configurations of the model
*/
void T2TFNN::InitModel(T2TConfig& config)
void FNN::InitModel(Config& config)
{
devID = config.devID;
......@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
}
......@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
XTensor FNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#ifndef __FNN_H__
#define __FNN_H__
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
class FNN
{
public:
/* device id */
......@@ -66,13 +65,13 @@ public:
public:
/* constructor */
T2TFNN();
FNN();
/* de-constructor */
~T2TFNN();
~FNN();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,16 +18,13 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "GLU.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -48,7 +44,7 @@ GLU::~GLU()
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(T2TConfig& config)
void GLU::InitModel(Config& config)
{
devID = config.devID;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -23,12 +22,11 @@
#ifndef __GLU_H__
#define __GLU_H__
#include "T2TLayerNormal.h"
#include "T2TGatedLinearUnit.h"
#include "LayerNorm.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
......@@ -68,7 +66,7 @@ public:
~GLU();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,19 +18,16 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "Embedding.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer
namespace nmt
{
/* constructor */
......@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(T2TConfig& config)
void LayerHistory::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
......@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new T2TLN[nlayer];
layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,14 +21,14 @@
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
......@@ -61,7 +60,7 @@ public:
TensorList history;
/* layer normalization for each intimidate layer */
T2TLN* layerNorms;
LN* layerNorms;
public:
/* constructor */
......@@ -71,7 +70,7 @@ public:
~LayerHistory();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* add the layer output to the history */
void Add(XTensor& tensor);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,24 +19,23 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "Embedding.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TLN::T2TLN()
LN::LN()
{
devID = -1;
d = 0;
}
/* de-constructor */
T2TLN::~T2TLN()
LN::~LN()
{
}
......@@ -47,7 +45,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> config - configurations of the model
*/
void T2TLN::InitModel(T2TConfig& config)
void LN::InitModel(Config& config)
{
devID = config.devID;
......@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
w.SetDataFixed(1);
}
/*
......@@ -64,7 +64,7 @@ make the network
>> input - the input tensor
>> return - layer normalization output
*/
XTensor T2TLN::Make(XTensor& input)
XTensor LN::Make(XTensor& input)
{
XTensor& x = input;
XTensor xn;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#ifndef __LAYERNORMAL_H__
#define __LAYERNORMAL_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
#include "../Utility.h"
#include "../../../network//XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
class LN
{
public:
/* device id */
......@@ -50,13 +49,13 @@ public:
public:
/* constructor */
T2TLN();
LN();
/* de-constructor */
~T2TLN();
~LN();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,12 +15,12 @@
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "T2TNNUtil.h"
#include "NNUtil.h"
namespace transformer
namespace nmt
{
/*
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,11 +15,11 @@
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __T2TNNUTIL_H__
#define __T2TNNUTIL_H__
#ifndef __NNUTIL_H__
#define __NNUTIL_H__
#include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h"
......@@ -28,7 +27,7 @@
using namespace nts;
namespace transformer
namespace nmt
{
/* the gather function for tensor with any dimension */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,18 +19,16 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "Output.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TOutput::T2TOutput()
Output::Output()
{
devID = -1;
vSize = -1;
......@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
}
/* de-constructor */
T2TOutput::~T2TOutput()
Output::~Output()
{
}
......@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
initialize the model
>> config - configurations of the model
*/
void T2TOutput::InitModel(T2TConfig& config)
void Output::InitModel(Config& config)
{
devID = config.devID;
hSize = config.modelSize;
......@@ -66,7 +63,7 @@ make the network (redefined output tensor)
>> isTraining - whether it is used for training
>> normalized - whether ignore the log-softmax
*/
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
XTensor& x = input;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,19 +19,19 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#ifndef __OUTPUT_H__
#define __OUTPUT_H__
#include "T2TUtility.h"
#include "../Utility.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* output layer */
class T2TOutput
class Output
{
public:
/* device id */
......@@ -49,13 +48,13 @@ public:
public:
/* constructor */
T2TOutput();
Output();
/* de-constructor */
~T2TOutput();
~Output();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include "..//..//..//tensor/XUtility.h"
using namespace nts;
bool Compare(const Example& a, const Example& b) {
return a.values.size() > b.values.size();
}
bool CompareRes(const Result& a, const Result& b) {
return a.id < b.id;
}
void DataSet::RerankRes(){
sort(resBuffer.begin(), resBuffer.end(), CompareRes);
}
/*
load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
const string tokenDelimiter = " ";
int id = 0;
while (getline(*fp, line)) {
vector<int> values = Split<int>(line, tokenDelimiter);
Example example;
example.id = id++;
example.values = values;
buffer.emplace_back(example);
}
if (fp->eof()) {
fp->seekg(fp->beg);
}
if (sortBuffer) {
sort(buffer.begin(), buffer.end(), Compare);
}
resBuffer.reserve(buffer.size());
}
/*
select a field and generate a mini-batch by indices
>>> batchEnc - a tensor to store the batch of input
>>> paddingEnc - a tensor to store the batch of paddings
>>> batchSize - batch size
>>> devID - devices id, -1 for CPU
>>> mem - the memory pool
*/
vector<int> DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID)
{
size_t realBatchSize = batchSize;
/* real batch size */
if ((buffer.size()-bufferUsed) < batchSize) {
realBatchSize = buffer.size()-bufferUsed;
}
/* get the maximum sentence length in a mini-batch */
size_t maxLen = 0;
if (realBatchSize == 1)
maxLen = buffer[bufferUsed].values.size();
for (size_t i = 0; i < realBatchSize - 1; ++i) {
maxLen = max(maxLen, buffer[bufferUsed+i].values.size());
}
CheckNTErrors(maxLen != 0, "wrong length dectected");
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; ++i) {
batchValues[i] = 1.0F;
}
memset(paddingValues, 0, sizeof(float) * maxLen * realBatchSize);
size_t cur = 0;
/* left padding */
vector<int> indices;
indices.reserve(realBatchSize);
for (size_t i = 0; i < realBatchSize; ++i) {
indices.push_back(buffer[bufferUsed + i].id);
cur = maxLen * (i + 1) - buffer[bufferUsed+i].values.size();
for (int v : buffer[bufferUsed + i].values) {
batchValues[cur] = v;
paddingValues[cur++] = 1.0F;
}
cur = maxLen * (i + 1);
}
InitTensor2DV2(batchEnc, realBatchSize, maxLen, X_INT, devID);
InitTensor2DV2(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);
bufferUsed += realBatchSize;
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
return indices;
}
/*
the constructor of DataSet
>>> fname - path of the data file
*/
void DataSet::Init(const char* fname)
{
fp = new ifstream(fname);
CheckNTErrors(fp->is_open(), "can not open the file");
bufferUsed = 0;
LoadDataToBuffer();
if (bufferSize == 0)
bufferSize = buffer.size();
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include "../../..//tensor/XTensor.h"
#include "../../..//tensor/XGlobal.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using namespace nts;
struct Example {
int id;
vector<int> values;
};
struct Result {
int id;
XTensor values;
};
using BufferType = vector<Example>;
using ResBufferType = vector<Result>;
bool Compare(const Example& a, const Example& b);
bool CompareRes(const Result& a, const Result& b);
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A `DataSet` is associated with a file which contains variable length data.*/
struct DataSet {
/* the data buffer */
BufferType buffer;
/* the result buffer */
ResBufferType resBuffer;
/* the pointer to file stream */
ifstream* fp{nullptr};
/* size of the data buffer */
size_t bufferSize{ 0 };
/* size of used data in buffer */
size_t bufferUsed{ 0 };
/* wether sort the dataset */
bool sortBuffer{ true };
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* rerank result for output */
void RerankRes();
/* generate a mini-batch */
vector<int> LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID);
/* initlization function */
void Init(const char* fname);
/* check if the buffer is empty */
bool IsEmpty() {
if (bufferUsed < bufferSize)
return false;
return true;
}
/* de-constructor */
~DataSet() {
if (fp)
fp->close();
delete fp;
}
};
} // namespace nts(NiuTrans.Tensor)
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "StringUtil.h"
namespace nts {
/* split string by delimiter, this will return indices of all sub-strings */
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
vector<pair<int, int>> fields;
if (delimiter.length() == 0) {
fields.emplace_back(0, s.length());
return fields;
}
int pos = 0;
int start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
fields.emplace_back(start, pos);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
fields.emplace_back(start, s.length());
}
return fields;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
vector<string> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(s.substr(p.first, p.second - p.first));
}
return fields;
}
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
vector<int> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
vector<int64_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
vector<float> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtof(s.data() + p.first, nullptr));
}
return fields;
}
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
vector<uint8_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
} // namespace nts
#endif // __STRING_UTIL_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int* buf;
/* another buffer */
int* buf2;
/* batch buf */
BatchNode* bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int* seqLen;
/* another array */
int* seqLen2;
/* offset of the first word for each sequence */
int* seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(T2TConfig& config);
/* load data to buffer */
int LoadBuf(FILE* file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vs, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* shuffle the data file */
void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a class of sentence pairs for training */
struct TrainExample {
/* id of the sentence pair */
int id;
/* source language setence (tokenized) */
IntList srcSent;
/* target language setence (tokenized) */
IntList tgtSent;
/* the key used to shuffle items in a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
public:
/* the data buffer */
TrainBufferType buffer;
/* a list of empty line number */
IntList emptyLines;
/* the pointer to file stream */
FILE* fp;
/* current index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences */
size_t bucketSize;
/* indicates whether it is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort the output by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is empty */
bool IsEmpty();
/* reset the buffer */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,25 +18,24 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__
#ifndef __TRAINER_H__
#define __TRAINER_H__
#include "../T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../../tensor/function/FHeader.h"
#include "../Model.h"
#include "TrainDataSet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* trainer of the T2T model */
class T2TTrainer
/* trainer of the model */
class Trainer
{
public:
/* configurations */
T2TConfig* cfg;
Config* cfg;
/* dimension size of each inner layer */
int d;
......@@ -63,12 +61,18 @@ public:
/* word batch size */
int wBatchSize;
/* size of bucket for grouping data by length */
int bucketSize;
/* training epoch number */
int nepoch;
/* traing step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use adam */
bool useAdam;
......@@ -100,39 +104,36 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
/* used for loading batches */
TrainDataSet batchLoader;
public:
/* constructor */
T2TTrainer();
Trainer();
/* de-constructor */
~T2TTrainer();
~Trainer();
/* initialize the trainer */
void Init(T2TConfig& config);
void Init(Config& config);
/* train the model */
void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
/* test the model */
void Validate(const char* fn, const char* ofn, T2TModel* model);
void Validate(const char* fn, const char* ofn, Model* model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
/* update the model by delta rule */
void Update(T2TModel* model, const float lr);
void Update(Model* model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel* model);
void PrepareModel(Model* model);
};
}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论