Commit b9c318bd by hello

no message

parent 1d17c439
@@ -4,3 +4,4 @@ x64/
vc140.pdb
NiuTrans.Tensor.vcxproj.user
NiuTrans.Tensor.aps
*.tgz
@@ -97,35 +97,47 @@ if(USE_CUDA)
        add_definitions(-DHALF_PRECISION)
    endif()
    find_package(CUDA REQUIRED)
    if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
        set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
    elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
        set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
    elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
        set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
    elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
        set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
    elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
        set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
    elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
        set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
    endif()
    if(USE_HALF_PRECISION)
        if(NOT DEFINED GPU_ARCH)
            set(ARCH_FLAGS -arch=sm_60
                -gencode=arch=compute_60,code=sm_60
                -gencode=arch=compute_61,code=sm_61
                -gencode=arch=compute_62,code=sm_62
                -gencode=arch=compute_70,code=sm_70
                -gencode=arch=compute_72,code=sm_72
                -gencode=arch=compute_70,code=compute_70
            )
        elseif(${GPU_ARCH} STREQUAL K OR ${GPU_ARCH} STREQUAL M)
            message(FATAL_ERROR "Half precision is not supported on Kepler/Maxwell GPUs")
        endif()
    endif()
    if(WIN32)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
        link_directories("${CUDA_ROOT}/lib/x64")
        include_directories("${CUDA_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
        if(CUDA_VERSION_MAJOR EQUAL 11)
            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
@@ -133,31 +145,14 @@ if(USE_CUDA)
    else()
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        link_directories("${CUDA_ROOT}/lib64")
        include_directories("${CUDA_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
        if(CUDA_VERSION_MAJOR EQUAL 11)
            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
......
@@ -46,7 +46,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the Visual Studio project (if your Visual Studio version is older than 2019, add the extra CMake argument `-A x64` to the commands below). To build a dynamic link library, add the extra CMake argument `-DGEN_DLL=ON`; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (check your device against the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus); a compile-time probe of this switch is sketched after this list).
- On success, CMake prints `Build files have been written to: ...`
- Open the NiuTensor.sln file in the build directory to load the NiuTensor project in Visual Studio.
- In the Solution Explorer, right-click NiuTensor and set it as the startup project to get started.
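As a minimal illustration of what the half-precision switch changes at compile time, here is a sketch (not part of the commit; it assumes only the `HALF_PRECISION` macro that the CMakeLists.txt excerpt above defines via `add_definitions(-DHALF_PRECISION)`):

```cpp
#include <cstdio>

int main() {
    // HALF_PRECISION is defined by the build system when the project is
    // configured with -DUSE_HALF_PRECISION=ON (Pascal or newer GPUs only).
#ifdef HALF_PRECISION
    std::printf("built with half-precision support\n");
#else
    std::printf("built with single-precision support only\n");
#endif
    return 0;
}
```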
@@ -67,7 +67,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Open the CLion preferences, click CMake under the "Build, Execution, Deployment" tab, and fill in "CMake options"; CLion then configures and builds the project with CMake automatically. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument in "CMake options"; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), enter `-DUSE_MKL=ON` in "CMake options" and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- To enable the OpenBLAS math library (installed separately by the user), enter `-DUSE_OPENBLAS=ON` in "CMake options" and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- To enable the CUDA math library (installed separately by the user), enter `-DUSE_CUDA=ON` in "CMake options", point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
##### CMake (command line)
@@ -78,7 +78,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the project. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
- On success, CMake prints `Build files have been written to: ...` and generates a Makefile in that directory.
- Run `make -j` to compile the NiuTensor project; on success it prints `Built target NiuTensor` and the installation is complete.
@@ -137,4 +137,4 @@ The NiuTensor open-source tensor library is developed by the Natural Language Processing Lab at Northeastern University…
## Version history
NiuTensor version 0.3.5 - February 6, 2021
# NiuTrans.Tensor Environment Setup
## Notes
The latest CUDA release (9.2) does not yet support the newest Visual Studio 2017, so CUDA 9.0 or 9.1 is recommended; use VS2015, or install the v140 toolset when using VS2017, and set the solution platform to x64.
## CUDA configuration
With VS and CUDA installed and the environment variables configured, the key CUDA options are listed below; all of them can be found under **Project -> Properties**.
>$(CUDA_PATH)\include
Add to **VC++ Directories -> Include Directories**.
>$(CUDA_PATH)\lib\Win32
Add to **VC++ Directories -> Library Directories**.
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
Add to **Linker -> Input -> Additional Dependencies**.
When configuration is complete, right-click **Project -> Project Dependencies** and select CUDA 9.
Right-click a .cu file, open its properties, and set the Item Type to "CUDA C/C++" (it is easiest to search for all .cu files, select them all, and set this in one pass).
## Other settings
**C/C++ -> General -> SDL checks**: set to No.
Under **C/C++ -> Preprocessor -> Preprocessor Definitions**, add
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;
**Linker -> System -> SubSystem**: set to Console.
**General -> Character Set**: use the Unicode character set.
Set the command-line arguments the executable needs under **Debugging -> Command Arguments** (the sketch below mirrors the arguments the sample entry point accepts).
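As a reference for those command arguments, the sample entry point later in this commit (Main.cpp) dispatches on the first argument. The following self-contained sketch mirrors that dispatch; it assumes only the `-fnnlm` and `-t2t` switches visible in the diff, and prints messages instead of calling the real `FNNLMMain`/`NMTMain`:

```cpp
#include <cstdio>
#include <cstring>

int main(int argc, const char** argv) {
    // "-fnnlm" selects the FNN language model sample, "-t2t" the NMT sample.
    if (argc > 1 && !std::strcmp(argv[1], "-fnnlm"))
        std::printf("would call FNNLMMain(argc - 1, argv + 1)\n");
    else if (argc > 1 && !std::strcmp(argv[1], "-t2t"))
        std::printf("would call NMTMain(argc - 1, argv + 1)\n");
    else
        std::printf("no sample selected; the real program prints a usage hint\n");
    return 0;
}
```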
@@ -39,7 +39,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the Visual Studio project (if your Visual Studio version is older than 2019, add the extra CMake argument `-A x64` to the commands below). To build a dynamic link library, add the extra CMake argument `-DGEN_DLL=ON`; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
- On success, CMake prints `Build files have been written to: ...`
- Open the NiuTensor.sln file in the build directory to load the NiuTensor project in Visual Studio.
- In the Solution Explorer, right-click NiuTensor and set it as the startup project to get started.
@@ -60,7 +60,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Open the CLion preferences, click CMake under the "Build, Execution, Deployment" tab, and fill in "CMake options"; CLion then configures and builds the project with CMake automatically. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument in "CMake options"; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), enter `-DUSE_MKL=ON` in "CMake options" and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- To enable the OpenBLAS math library (installed separately by the user), enter `-DUSE_OPENBLAS=ON` in "CMake options" and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- To enable the CUDA math library (installed separately by the user), enter `-DUSE_CUDA=ON` in "CMake options", point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
##### CMake (command line)
@@ -71,7 +71,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the project. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
- On success, CMake prints `Build files have been written to: ...` and generates a Makefile in that directory.
- Run `make -j` to compile the NiuTensor project; on success it prints `Built target NiuTensor` and the installation is complete.
......
@@ -26,7 +26,7 @@
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"

//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
@@ -34,7 +34,7 @@
using namespace nts;
using namespace fnnlm;
using namespace nmt;

int main( int argc, const char ** argv )
{
@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        NMTMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"

namespace nmt
{
/* constructor */
@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(Config& config)
{
    devID = config.devID;
    nlayer = config.nDecLayer;
@@ -80,16 +77,17 @@ void AttDecoder::InitModel(Config& config)
    /* embedding model */
    embedder.InitModel(config, false);

    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    selfAttLayerNorms = new LN[nlayer];
    enDeAtt = new Attention[nlayer];
    enDeAttLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];
    selfAttCache = new Cache[nlayer];
    enDeAttCache = new Cache[nlayer];

    if (preNorm)
        decoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -99,6 +97,8 @@ void AttDecoder::InitModel(Config& config)
        fnnLayerNorms[i].InitModel(config);
        enDeAtt[i].InitModel(config);
        enDeAttLayerNorms[i].InitModel(config);
        selfAttCache[i].enable = true;
        enDeAttCache[i].enable = true;
    }

    if (preNorm)
        decoderLayerNorm->InitModel(config);
@@ -118,6 +118,7 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                         XTensor* maskEncDec, int nstep, bool isTraining)
{
    XTensor x;

    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout */
@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
    }

    if (preNorm)
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network (pre-norm)
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which positions are valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for encoder-decoder attention */
x = enDeAttLayerNorms[i].Make(x);
/* encoder-decoder attention */
x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
    x = decoderLayerNorm->Make(x);

    return x;
}

}
\ No newline at end of file
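The new `MakeFast` above hard-codes the pre-norm layer layout, while `Make` supports both orders through the `LayerNorm` wrapper. The difference is sketched schematically below with hypothetical `ln`, `sublayer`, and `dropout` callables standing in for the real modules; only the ordering is taken from the code above:

```cpp
#include <functional>

// T is any tensor-like type with operator+ (the residual connection).
template <typename T>
T PreNormStep(T x, std::function<T(T)> sublayer,
              std::function<T(T)> ln, std::function<T(T)> dropout) {
    // Pre-norm (MakeFast): normalize first, run the sublayer,
    // and let the residual skip around both.
    return x + dropout(sublayer(ln(x)));
}

template <typename T>
T PostNormStep(T x, std::function<T(T)> sublayer,
               std::function<T(T)> ln, std::function<T(T)> dropout) {
    // Post-norm: the residual sum itself is normalized.
    return ln(x + dropout(sublayer(x)));
}
```

With pre-norm, a single final layer normalization is applied after the last layer, which is exactly what the tail of `MakeFast` does with `decoderLayerNorm`.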
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,13 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#ifndef __DECODER_H__
#define __DECODER_H__

#include "Encoder.h"
#include "Utility.h"

namespace nmt
{

class AttDecoder
@@ -52,28 +51,28 @@ public:
    DTYPE dropoutP;

    /* embedding of word at each position */
    Embedder embedder;

    /* FNN model of each layer */
    FNN* fnns;

    /* attention model of each layer */
    Attention* selfAtt;

    /* layer normalization for attention */
    LN* selfAttLayerNorms;

    /* layer normalization for fnn */
    LN* fnnLayerNorms;

    /* layer normalization for decoder */
    LN* decoderLayerNorm;

    /* encoder-decoder attention model of each layer */
    Attention* enDeAtt;

    /* layer normalization for encoder-decoder attention */
    LN* enDeAttLayerNorms;

    /* layer cache list */
    Cache* selfAttCache;
@@ -92,11 +91,15 @@ public:
    ~AttDecoder();

    /* initialize the model */
    void InitModel(Config& config);

    /* make the decoding network */
    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                 XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};

}
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"

namespace nmt
{

/* constructor */
@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(Config& config)
{
    devID = config.devID;
@@ -68,18 +65,18 @@ void AttEncoder::InitModel(Config& config)
    dropoutP = config.dropout;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(config);

    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    attLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];

    if (preNorm)
        encoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);

        /* self attention */
        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
@@ -151,6 +148,62 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
    }

    if (preNorm)
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (pre-norm)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = attLayerNorms[i].Make(x);
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
    x = encoderLayerNorm->Make(x);

    return x;
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,25 +19,25 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#ifndef __ENCODER_H__
#define __ENCODER_H__

#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "../../network/XNet.h"

using namespace nts;

namespace nmt
{

/*
base class of the encoder
*/
class Encoder
{
public:
    virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
@@ -47,7 +46,7 @@ public:

/*
the encoder based on self-attention
*/
class AttEncoder : Encoder
{
public:
    /* device id */
@@ -73,22 +72,22 @@ public:
    int ignored;

    /* embedding of word at each position */
    Embedder embedder;

    /* FNN model of each layer */
    FNN* fnns;

    /* attention model of each layer */
    Attention* selfAtt;

    /* layer normalizations for attention */
    LN* attLayerNorms;

    /* layer normalization for fnn */
    LN* fnnLayerNorms;

    /* layer normalization for encoder */
    LN* encoderLayerNorm;

    /* the location of layer normalization */
    bool preNorm;
@@ -101,11 +100,14 @@ public:
    ~AttEncoder();

    /* initialize the model */
    void InitModel(Config& config);

    /* make the encoding network */
    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
    /* make the encoding network (wrapper) */
    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,32 +21,32 @@
#include <cstdint>

#include "Model.h"
#include "Utility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"

namespace nmt
{
/* constructor */
Model::Model()
{
    devID = -1;
    isLM = false;
    isMT = false;
    useFP16 = false;
    shareAllEmbeddings = 0;
    shareDecInputOutputWeight = 0;
    nhead = 1;

    encoder = new AttEncoder();
    decoder = new AttDecoder();
    outputLayer = new Output();
}

/* de-constructor */
Model::~Model()
{
    delete encoder;
    delete decoder;
@@ -58,7 +57,7 @@ Model::~Model()
initialize the model
>> config - configurations of the model
*/
void Model::InitModel(Config& config)
{
    devID = config.devID;
    isMT = config.isMT;
@@ -71,8 +70,8 @@ void Model::InitModel(Config& config)
        &config.fnnHiddenSize, &config.modelSize,
        &config.embSize, &config.srcVocabSize,
        &config.tgtVocabSize, &config.nhead,
        &config.maxRP, &config.shareAllEmbeddings,
        &config.shareDecInputOutputWeight,
        &config.maxPosLen
    };
@@ -81,10 +80,28 @@ void Model::InitModel(Config& config)
    /* read model configurations */
    if (!config.isTraining) {
        modelFile = fopen(config.modelFN, "rb");
        CheckNTErrors(modelFile, "Failed to open the model file");
        for (auto& meta : metaInfo) {
            fread(meta, sizeof(int), 1, modelFile);
        }
}
else {
/* read the source and target vocab size */
FILE* trainF = fopen(config.trainFN, "rb");
CheckNTErrors(trainF, "Failed to open the training file");
fread(&config.srcVocabSize, sizeof(config.srcVocabSize), 1, trainF);
fread(&config.tgtVocabSize, sizeof(config.tgtVocabSize), 1, trainF);
CheckNTErrors(config.srcVocabSize > 0, "Invalid source vocabulary size");
CheckNTErrors(config.tgtVocabSize > 0, "Invalid target vocabulary size");
fclose(trainF);
}
    nhead = config.nhead;
shareAllEmbeddings = config.shareAllEmbeddings;
shareDecInputOutputWeight = config.shareDecInputOutputWeight;
ShowModelConfig(config);
    encoder->InitModel(config);
    outputLayer->InitModel(config);
@@ -92,13 +109,12 @@ void Model::InitModel(Config& config)
    if (isMT)
        decoder->InitModel(config);

    /* load parameters */
    if (!config.isTraining)
        Read(modelFile);
    else {
        TensorList params;
        GetParams(params);
        for (int i = 0; i < params.Size(); i++)
            params[i]->SetVarFlag();
    }
@@ -108,13 +124,28 @@
}

/*
print model configurations
>> config - model configurations
*/
void Model::ShowModelConfig(Config& config)
{
/* TODO: output more info */
XPRINT1(0, stderr, "encoder layer: %d\n", config.nEncLayer);
XPRINT1(0, stderr, "decoder layer: %d\n", config.nDecLayer);
XPRINT1(0, stderr, "attention heads: %d\n", config.nhead);
XPRINT1(0, stderr, "model size: %d\n", config.modelSize);
XPRINT1(0, stderr, "source vocab size: %d\n", config.srcVocabSize);
XPRINT1(0, stderr, "target vocab size: %d\n", config.tgtVocabSize);
}
/*
make the encoding network
>> input - input tensor, (batchSize, srcLen)
>> mask - the mask for encoder self-attention, (headNum, batchSize, srcLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result, (batchSize, srcLen, hiddenDim)
*/
XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
    XTensor nothing;
@@ -123,15 +154,14 @@ XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)

/*
make the decoding network
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> outputEnc - output tensor of the encoder, (batchSize, srcLen, hiddenDim)
>> mask - mask for decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask for the encoder-decoder attention, (headNum, batchSize, tgtLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - decoding result, (batchSize, tgtLen, hiddenDim)
*/
XTensor Model::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
                           XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
@@ -145,7 +175,7 @@ make the network for language modeling (with the output softmax layer)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
    int len = padding.GetDim(padding.order - 1);
    int* dims = new int[padding.order + 2];
@@ -173,19 +203,19 @@ void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)

/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder, (batchSize, srcLen)
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> output - output tensor (distribution), (batchSize, tgtLen, hiddenDim)
>> paddingEnc - padding of the sequences (on the encoder side), (batchSize, srcLen)
>> paddingDec - padding of the sequences (on the decoder side), (batchSize, tgtLen)
>> isTraining - indicates whether the model is for training
*/
void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
                   XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
    XTensor encoding;
    XTensor decoding;
    XTensor maskEnc;
    XTensor maskDec;
    XTensor maskEncDec;
@@ -213,7 +243,7 @@ make the mask for training MT models
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
                       XTensor& paddingEnc, XTensor& paddingDec,
                       XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
@@ -260,8 +290,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;

    XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);

    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -284,37 +313,27 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,

/*
make the mask of the encoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> maskEnc - mask of the encoder self-attention, (headNum, batchSize, srcLen, srcLen)
*/
void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
{
    XTensor padding2;

    /* mask of the padding */
    Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
    Unsqueeze(padding2, maskEnc, 0, nhead);
    ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
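The rewritten `MakeMTMaskEnc` now writes the mask directly: `Unsqueeze` broadcasts the padding map to the attention shape, and `ScaleAndShiftMe(maskEnc, 1e9F, -1e9F)` turns padding value 1 (a real token) into 0 and padding value 0 (a pad) into -1e9, so the softmax assigns padded positions a near-zero weight. A scalar sketch of that arithmetic (illustrative only, not part of the commit):

```cpp
#include <cstdio>

// Scalar view of ScaleAndShiftMe(mask, 1e9F, -1e9F): y = 1e9 * x - 1e9.
static float MaskValue(float padding) {
    return 1e9f * padding - 1e9f;
}

int main() {
    std::printf("real token: %g\n", MaskValue(1.0f)); // 0: logit unchanged
    std::printf("padding:    %g\n", MaskValue(0.0f)); // -1e9: logit pushed toward -inf
    return 0;
}
```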
/*
make the mask of the decoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> paddingDec - padding of the decoder input, (batchSize, tgtLen)
>> maskDec - mask of the decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask of the decoder enc-dec attention, (headNum, batchSize, tgtLen, srcLen)
*/
void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
                          XTensor& maskDec, XTensor& maskEncDec)
{
    int len = paddingDec.GetDim(paddingDec.order - 1);
@@ -340,26 +359,27 @@ void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
    delete[] dims;
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void Model::GetParams(TensorList& list)
{
    list.Clear();

    /* encoder parameters */
    for (int i = 0; i < encoder->nlayer; i++) {
        list.Add(&encoder->selfAtt[i].weightQ);
        list.Add(&encoder->selfAtt[i].weightK);
        list.Add(&encoder->selfAtt[i].weightV);
        list.Add(&encoder->selfAtt[i].biasQ);
        list.Add(&encoder->selfAtt[i].biasK);
        list.Add(&encoder->selfAtt[i].biasV);
        if (encoder->selfAtt[i].useRPR)
            list.Add(&encoder->selfAtt[i].RPEmbK);
        list.Add(&encoder->selfAtt[i].weightO);
        list.Add(&encoder->selfAtt[i].biasO);
        list.Add(&encoder->fnns[i].w1);
        list.Add(&encoder->fnns[i].b1);
        list.Add(&encoder->fnns[i].w2);
@@ -377,26 +397,26 @@ void Model::GetParams(TensorList& list)
    if (isMT) {
        /* decoder parameters */
        for (int i = 0; i < decoder->nlayer; i++) {
            list.Add(&decoder->selfAtt[i].weightQ);
            list.Add(&decoder->selfAtt[i].weightK);
            list.Add(&decoder->selfAtt[i].weightV);
            list.Add(&decoder->selfAtt[i].biasQ);
            list.Add(&decoder->selfAtt[i].biasK);
            list.Add(&decoder->selfAtt[i].biasV);
            if (decoder->selfAtt[i].useRPR)
                list.Add(&decoder->selfAtt[i].RPEmbK);
            list.Add(&decoder->selfAtt[i].weightO);
            list.Add(&decoder->selfAtt[i].biasO);
            list.Add(&decoder->selfAttLayerNorms[i].w);
            list.Add(&decoder->selfAttLayerNorms[i].b);
            list.Add(&decoder->enDeAtt[i].weightQ);
            list.Add(&decoder->enDeAtt[i].weightK);
            list.Add(&decoder->enDeAtt[i].weightV);
            list.Add(&decoder->enDeAtt[i].biasQ);
            list.Add(&decoder->enDeAtt[i].biasK);
            list.Add(&decoder->enDeAtt[i].biasV);
            list.Add(&decoder->enDeAtt[i].weightO);
            list.Add(&decoder->enDeAtt[i].biasO);
            list.Add(&decoder->enDeAttLayerNorms[i].w);
            list.Add(&decoder->enDeAttLayerNorms[i].b);
            list.Add(&decoder->fnns[i].w1);
@@ -418,8 +438,9 @@ void Model::GetParams(TensorList& list)
        list.Add(&decoder->embedder.w);
    }

    if (shareDecInputOutputWeight == 0) {
        list.Add(&outputLayer->w);
    }
}
/*
@@ -427,14 +448,14 @@ dump the model to a file
>> fn - where to save the model
>> model - the model
*/
void Model::Dump(const char* fn)
{
    double startT = GetClockSec();

    FILE* file = fopen(fn, "wb");
    CheckNTErrors(file, "Cannot open the model file");

    TensorList params;

    GetParams(params);
@@ -459,22 +480,29 @@ void Model::Dump(const char* fn)
    double elapsed = GetClockSec() - startT;
    LOG("model saved (took %.1fs)", elapsed);
}

/* read the parameters */
void Model::Read(FILE* file)
{
    double startT = GetClockSec();

    TensorList params;
    GetParams(params);
LOG("params count: %lu", params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
}
LOG("params size: %d", size);
/* convert parameters to FP16 */ /* convert parameters to FP16 before reading files */
if (useFP16) { if (useFP16) {
LOG("Convert parameters to FP16");
for (int i = 0; i < params.Size(); i++) { for (int i = 0; i < params.Size(); i++) {
XTensor* p = params[i]; XTensor* p = params[i];
InitTensorV2(p, p->order, p->dimSize, X_FLOAT16, 1, p->devID); InitTensor(p, p->order, p->dimSize, X_FLOAT16, p->devID, p->enableGrad && X_ENABLE_GRAD);
} }
auto& encEmb = encoder->embedder.posEmbeddingBase; auto& encEmb = encoder->embedder.posEmbeddingBase;
...@@ -488,18 +516,18 @@ void T2TModel::Read(FILE* file) ...@@ -488,18 +516,18 @@ void T2TModel::Read(FILE* file)
/* share all embeddings */ /* share all embeddings */
if (shareAllEmbeddings == 1) { if (shareAllEmbeddings == 1) {
decoder->embedder.w = CopyValues(encoder->embedder.w); _CopyValues(&encoder->embedder.w, &decoder->embedder.w);
XPRINT(0, stderr, "[INFO] sharing encoder decoder embeddings\n"); LOG("sharing encoder decoder embeddings");
} }
/* share embeddings with output weights */ /* share embeddings with output weights */
if (shareDecInputOutputWeight == 1) { if (shareDecInputOutputWeight == 1) {
outputLayer->w = CopyValues(decoder->embedder.w); _CopyValues(&decoder->embedder.w, &outputLayer->w);
XPRINT(0, stderr, "[INFO] sharing decoder embeddings with output weights\n"); LOG("sharing decoder embeddings with output weights");
} }
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed); LOG("model loaded (took %.1fs)", elapsed);
} }
} }
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,23 +19,22 @@ ...@@ -20,23 +19,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TMODEL_H__ #ifndef __MODEL_H__
#define __T2TMODEL_H__ #define __MODEL_H__
#include "T2TEncoder.h" #include "Encoder.h"
#include "T2TDecoder.h" #include "Decoder.h"
#include "module/T2TFNN.h" #include "module/FNN.h"
#include "module/T2TOutput.h" #include "module/Output.h"
#include "module/T2TUtility.h" #include "Utility.h"
#include "module/T2TAttention.h" #include "module/Attention.h"
namespace transformer namespace nmt
{ {
/* a transformer model that keeps parameters of the encoder, /* a nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates the decoder and the output layer (softmax). */
the network used in transformer. */ class Model
class T2TModel
{ {
public: public:
/* device id */ /* device id */
...@@ -49,7 +47,7 @@ public: ...@@ -49,7 +47,7 @@ public:
AttDecoder* decoder; AttDecoder* decoder;
/* output layer */ /* output layer */
T2TOutput* outputLayer; Output* outputLayer;
/* indicates whether the model is running for language modeling */ /* indicates whether the model is running for language modeling */
bool isLM; bool isLM;
...@@ -71,13 +69,16 @@ public: ...@@ -71,13 +69,16 @@ public:
public: public:
/* constructor */ /* constructor */
T2TModel(); Model();
/* de-constructor */ /* de-constructor */
~T2TModel(); ~Model();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */ /* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining); XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -17,49 +16,47 @@ ...@@ -17,49 +16,47 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/ */
#include <cmath>
#include <ctime> #include <ctime>
#include "Transformer.h" #include "NMT.h"
#include "train/T2TTrainer.h" #include "train/Trainer.h"
#include "module/T2TUtility.h" #include "translate/Translator.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
namespace transformer namespace nmt
{ {
int TransformerMain(int argc, const char** argv) int NMTMain(int argc, const char** argv)
{ {
if (argc == 0) if (argc == 0)
return 1; return 1;
/* load configurations */ /* load configurations */
T2TConfig config(argc, argv); Config config(argc, argv);
srand((unsigned int)time(NULL)); srand(1);
/* train the model */ /* training */
if (strcmp(config.trainFN, "") != 0) { if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model; Model model;
model.InitModel(config); model.InitModel(config);
T2TTrainer trainer; Trainer trainer;
trainer.Init(config); trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model); trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
} }
/* translate the test file */ /* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) { if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD; DISABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config); model.InitModel(config);
T2TTranslator translator; Translator translator;
translator.Init(config); translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN, translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model); config.tgtVocabFN, config.outputFN, &model);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -16,29 +15,17 @@ ...@@ -16,29 +15,17 @@
*/ */
/* /*
* * An implementation of the NMT system.
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* work on MT
*/ */
#ifndef __TRANSFORMER_H__ #ifndef __NMT_H__
#define __TRANSFORMER_H__ #define __NMT_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* entrance of the program */ /* entrance of the program */
int TransformerMain(int argc, const char** argv); int NMTMain(int argc, const char** argv);
} }
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TAttention.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TAttention::T2TAttention()
{
nhead = -1;
dk = -1;
dv = -1;
d = -1;
isMasked = false;
ignored = 0;
}
/* deconstructor */
T2TAttention::~T2TAttention()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
*/
void T2TAttention::InitModel(int argc, char** argv,
bool myIsMasked, int myIgnored,
int myDevID)
{
devID = myDevID;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 4);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "maxPosition", &max_relative_position, 8);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2DV2(&wq, d, d, X_FLOAT, devID);
InitTensor1DV2(&bq, d, X_FLOAT, devID);
InitTensor2DV2(&wk, d, d, X_FLOAT, devID);
InitTensor1DV2(&bk, d, X_FLOAT, devID);
InitTensor2DV2(&wv, d, d, X_FLOAT, devID);
InitTensor1DV2(&bv, d, X_FLOAT, devID);
InitTensor2DV2(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2DV2(&wo, d, d, X_FLOAT, devID);
InitTensor1DV2(&bo, d, X_FLOAT, devID);
}
/*
make the network
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
>> isTraining - indicates whether the model is used for training
>> cache - layer cache list
>> cacheType - which type that cache is
<< return - multi-head attention result
*/
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
{
const bool isEnc = (cache == NULL);
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MatrixMul(q, wq) + bq;
if (!cache) {
/* self attention for encoder layers */
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
}
else {
if (cacheType == SELF_ATT) {
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
/* if the cache hits, we only concatenate it with the keys/values of the new token */
if (!cache->miss) {
k2 = Concatenate(cache->key, k2, 1);
v2 = Concatenate(cache->value, v2, 1);
}
cache->key = k2;
cache->value = v2;
cache->miss = false;
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
else if (cacheType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MatrixMul(k, wk) + bk;
cache->value = MatrixMul(v, wv) + bv;
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
CheckNTErrors(0, "invalid cache type");
}
}
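/* Illustrative note (not part of the original code): across decoding steps
   the cache behaves as follows. At step 0, cache->miss is true, so the keys
   and values are computed from scratch and stored. At step t > 0, only the
   new token's keys/values are computed and concatenated to cache->key and
   cache->value along dimension 1, so the cached keys grow as (B, 1, H),
   (B, 2, H), and so on. For encoder-decoder attention the keys/values are
   computed once on the first miss and re-used unchanged afterwards. */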
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
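/* A self-contained sketch of the scaled dot-product attention computed
   above, for a single head on plain row-major arrays (illustrative only;
   the function name and the layout q[Lq*D], k[Lkv*D], v[Lkv*D] are
   assumptions, not part of the original API; <math.h> is already included
   at the top of this file). */
static void ScaledDotAttentionSketch(const float* q, const float* k, const float* v,
                                     float* out, int Lq, int Lkv, int D)
{
    float* scores = new float[Lkv];
    for (int i = 0; i < Lq; i++) {
        /* scores_j = (q_i . k_j) / sqrt(D) */
        float maxS = -1e30F;
        for (int j = 0; j < Lkv; j++) {
            float s = 0;
            for (int t = 0; t < D; t++)
                s += q[i * D + t] * k[j * D + t];
            scores[j] = s / (float)sqrt((float)D);
            if (scores[j] > maxS)
                maxS = scores[j];
        }
        /* softmax over the key dimension (max-subtracted for stability) */
        float sum = 0;
        for (int j = 0; j < Lkv; j++) {
            scores[j] = (float)exp(scores[j] - maxS);
            sum += scores[j];
        }
        /* out_i = sum_j softmax(scores)_j * v_j */
        for (int t = 0; t < D; t++) {
            float o = 0;
            for (int j = 0; j < Lkv; j++)
                o += (scores[j] / sum) * v[j * D + t];
            out[i * D + t] = o;
        }
    }
    delete[] scores;
}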
/*
make the attention network by incorporating the relative position representation with the given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batch_size = q.GetDim(0);
const int len_q = q.GetDim(1);
const int len_kv = k.GetDim(1);
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
XTensor emb_matrix, relative_key;
InitTensor2DV2(&emb_matrix, len_q, len_kv, X_INT, q.devID);
InitTensor3DV2(&relative_key, len_q, len_kv, kheads.GetDim(-1), X_FLOAT, q.devID);
InitTensor4DV2(&dot, nhead, batch_size, len_q, len_kv, X_FLOAT, q.devID);
/* generate the relative emb index (L_q, L_kv) */
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID, is_encoder);
/* generate the relative key from the rp_embedding_k (L_q, L_kv, H/K) */
_Gather(&rp_embedding_k, &relative_key, &emb_matrix);
/* RPR dot product (K, B, L_q, L_kv)*/
RPDotProduct(&qheads, &kheads, &relative_key, &dot, true);
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
/* scale the dot result */
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
/* softmax */
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int devID, const bool is_encoder)
{
XTensor range;
InitTensor1DV2(&range, len_kv, X_INT, devID);
int* index = new int[len_kv];
// for encoder self-attention, where L_q = L_kv
if (is_encoder)
{
for (int i = 0; i < len_kv; i++)
index[i] = i;
range.SetData(index, len_kv);
XTensor range_2D, range_2D_t;
InitTensor2DV2(&range_2D, len_q, len_kv, X_INT, devID);
InitTensor2DV2(&range_2D_t, len_q, len_kv, X_INT, devID);
_Unsqueeze(&range, &range_2D, 0, len_q);
_Transpose(&range_2D, &range_2D_t, 0, 1);
_Sum(&range_2D, &range_2D_t, emb_matrix, -1);
}
// for decoder self-attention, where L_q != L_kv and L_q is 1
else
{
for (int i = 0; i < len_kv; i++)
index[i] = -len_kv + i + 1;
range.SetData(index, len_kv);
_Unsqueeze(&range, emb_matrix, 0, len_q);
}
// clip the tensor values to the range [-max_relative_length, max_relative_length]
_Clip(emb_matrix, emb_matrix, -max_relative_length, max_relative_length);
// (L_q, L_kv)
_ScaleAndShift(emb_matrix, emb_matrix, 1, max_relative_length);
delete[] index;
}
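/* Worked example of the index matrix built above (illustrative only).
   With len_q = len_kv = 4 and max_relative_position = 2, the raw relative
   distances j - i are
        0  1  2  3
       -1  0  1  2
       -2 -1  0  1
       -3 -2 -1  0
   and after clipping to [-2, 2] and shifting by +2 they become row indices
   into rp_embedding_k (which has 2 * 2 + 1 = 5 rows):
        2  3  4  4
        1  2  3  4
        0  1  2  3
        0  0  1  2 */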
void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key)
{
const int head_num = nhead;
const int batch_size = x->dimSize[1];
const int len_kv = y->dimSize[2];
const int len_q = x->dimSize[2];
const int depth = y->dimSize[3];
// L_kv (is_key=True) or H/K (is_key=False)
const int last_dim = is_key ? len_kv : depth;
MATRIX_TRANS_TYPE transpose_flag = is_key ? X_TRANS : X_NOTRANS;
//if (profiler_) profiler_->StartTimer("RPDotPro-BMM");
// for key: batch-MM: (K,B,L_q,H/K) * (K,B,H/K,L_kv) -> (K,B,L_q,L_kv)
// for not key: batch-MM: (K,B,L_q,L_kv) * (K,B,L_kv,H/K) -> (K,B,L_q,H/K)
XTensor context;
InitTensor4DV2(&context, head_num, batch_size, len_q, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(x, X_NOTRANS, y, transpose_flag, &context);
// reshape and transpose x to (L_q, K*B, H/K or L_kv)
int merge_dims[] = { head_num * batch_size, len_q, x->dimSize[3] };
x->Reshape(3, merge_dims);
XTensor x_t;
InitTensor3DV2(&x_t, len_q, head_num * batch_size, x->GetDim(-1), X_FLOAT, x->devID);
_Transpose(x, &x_t, 0, 1);
// for key: batch-MM: (L_q, K*B, H/K) * (L_q, L_kv, H/K) -> (L_q, K*B, L_kv)
// for not key: batch-MM: (L_q, K*B, L_kv) * (L_q, L_kv, H/K) -> (L_q, K*B, H/K)
XTensor relative;
InitTensor3DV2(&relative, len_q, head_num * batch_size, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(&x_t, X_NOTRANS, z, transpose_flag, &relative);
// (L_q, K*B, H/K or L_kv) -> (K*B, L_q, H/K or L_kv)
XTensor relative_t;
InitTensor3DV2(&relative_t, head_num * batch_size, len_q, last_dim, X_FLOAT, x->devID);
_Transpose(&relative, &relative_t, 0, 1);
int split_dims[] = { head_num, batch_size, len_q, last_dim };
relative_t.Reshape(4, split_dims);
_Sum(&context, &relative_t, attention);
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
/* cache for keys */
XTensor key;
/* cache for values */
XTensor value;
public:
bool miss;
Cache() {
miss = true;
}
void Update(XTensor&& k, XTensor&& v) {
key = k;
value = v;
miss = false;
}
};
/*
multi-head attention
y(Q, K, V) = cat(head_1, head_2, ..., head_n)
where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
d_k = dimension size of K
*/
class T2TAttention
{
public:
/* device id */
int devID;
/* head number */
int nhead;
/* transformation matrix for Q */
XTensor wq;
/* bias for Q */
XTensor bq;
/* transformation matrix for K */
XTensor wk;
/* bias for K */
XTensor bk;
/* transformation matrix for V */
XTensor wv;
/* bias for V */
XTensor bv;
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wo;
/* bias after dot-product attention */
XTensor bo;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. This is useful in language
modeling where the first position needs a special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
/* max relative window size */
int max_relative_position;
public:
/* constructor */
T2TAttention();
/* de-constructor */
~T2TAttention();
/* initialize the model */
void InitModel(int argc, char** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
/* make the network */
XTensor Make( XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, Cache* cache, int cacheType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
/* make the attention network with relative position representation (RPR) given keys, queries and values (after linear transformation) */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
void GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int device_id, const bool is_encoder);
void RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#include <math.h>
#include "T2TEmbedding.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* deconstructor */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
devID = myDevID;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 1024);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength, padIdx);
}
/*
make positional embeddings (of size length * eSize)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - maximum length of the sequence
>> padIdx - index of the padding token (its embedding is zeroed)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
}
/* padding zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
delete[] data;
}
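/* Note (illustrative): since exp(-i * log(10000) / (channelSize - 1)) equals
   10000^(-i / (channelSize - 1)), the table above is the familiar sinusoidal
   encoding
       PE(pos, i)               = sin(pos / 10000^(i / (channelSize - 1)))
       PE(pos, i + channelSize) = cos(pos / 10000^(i / (channelSize - 1)))
   with the sin and cos halves stored contiguously rather than interleaved,
   and with the row at padIdx zeroed out. */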
/*
make the network
>> input - word indices of the input sequence
>> prevLen - length of the previous sequence (for incremental decoding)
>> nstep - the current decoding step (decoder side only)
>> isDec - indicates whether this is the decoder-side embedder
*/
XTensor T2TEmbedder::Make(XTensor &input, int prevLen, int nstep, bool isDec)
{
/* assert padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensorV2(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
if (!isDec)
{
for (int i = 0; i < inputCPU.GetDim(0); i++) {
int startNoPad = 2 + prevLen;
int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
for (int j = 0; j < inputCPU.GetDim(1); j++) {
if (p[j] == 1) {
posData[i * inputCPU.GetDim(1) + j] = 1;
}
else {
posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
}
else
{
for (int i = 0; i < position.GetDim(0); i++) {
for (int j = 0; j < position.GetDim(1); j++) {
position.Set2DInt(nstep + 2, i, j);
}
}
}
delete[] posData;
/* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position);
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return Sum(wordEmbedding, posEmbedding);
}
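/* Worked example of the position indices built above (illustrative only).
   Assume the padding index is 1, prevLen = 0, and an encoder-side input row
   [5, 8, 9, 1, 1] (two padded slots). Real tokens are numbered from
   startNoPad = 2 and padded slots keep position 1:
       input    : 5 8 9 1 1
       position : 2 3 4 1 1
   On the decoder side, every token of the current step simply gets
   position nstep + 2. */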
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 128
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* padding index */
int padIdx;
/* word embedding matrix */
XTensor w;
/* predefined positional embeddings. Re-using this table speeds up
the embedding step. */
XTensor posEmbeddingBase;
public:
/* constructor */
T2TEmbedder();
/* de-constructor */
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length, int padIdx);
/* make the network */
XTensor Make(XTensor &input, int prevLen=0, int nstep = -1, bool isDec = false);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
T2TFNN::T2TFNN()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* deconstructor */
T2TFNN::~T2TFNN()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TFNN::InitModel(int argc, char** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID);
//float scale = 1.0F;
//float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
//float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
//
//w1.SetDataRand(-finfout1, finfout1);
//b1.SetZeroAll();
//w2.SetDataRand(-finfout2, finfout2);
//b2.SetZeroAll();
}
/*
make the network (pre-norm with a residual connection)
y = x + max(0, LN(x) * w1 + b1) * w2 + b2
>> input - the input tensor
>> isTraining - indicates whether the model is used for training
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, LN(x) * w1 + b1) */
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
XTensor res;
res = MulAndShift(t1, w2, b2);
_SumMe(&res, &input);
return res;
}
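/* Note (illustrative): this is the pre-norm variant - the normalization is
   applied to the block input and the residual is added after the second
   linear layer - rather than the post-norm form LN(x + FNN(x)) of the
   original Transformer paper. Pre-norm generally makes deep stacks easier
   to train. */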
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#include "T2TLayerNormal.h"
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
/* layer normalization for fnn */
T2TLN fnnLayerNorm;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TFNN();
/* deconstructor */
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
devID = -1;
d = 0;
}
/* de-constructor */
T2TLN::~T2TLN()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
}
/*
make the network
for each layer representation x, we have
y = (x - \mu) / \sigma * w + b
>> input - the input tensor
>> return - layer normalization output
*/
XTensor T2TLN::Make(XTensor &input)
{
XTensor &x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
XTensor meanFilled;
XTensor standardFilled;
/* \mu = (sum_i x_i)/m */
mean = ReduceMean(x, x.order - 1);
/* \sigma^2 = (sum_i (x_i - \mu)^2)/m (a small epsilon is added for numerical stability) */
variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
}
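/* Worked example (illustrative only). For a row x = [1, 2, 3, 4]:
       mean     = 2.5
       variance = ((-1.5)^2 + (-0.5)^2 + 0.5^2 + 1.5^2) / 4 = 1.25
       standard = sqrt(1.25 + 1e-5) ~= 1.1180
       x'       ~= [-1.3416, -0.4472, 0.4472, 1.3416]
   and the output is x' * w + b, applied element-wise. */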
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
/* device id */
int devID;
/* the scale vector w */
XTensor w;
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
T2TLN();
/* de-constructor */
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
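/* Worked example (illustrative only). For a hypothesis of length n = 7
   and alpha = 0.6:
       lp = ((5 + 7) / (5 + 1))^0.6 = 2^0.6 ~= 1.5157
   so the log-probability of the path is divided by about 1.52, softening
   the advantage that short sequences would otherwise enjoy. */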
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
 * Writing documents is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences because they obtain higher scores
(the score is a product of probability-like terms, so fewer factors yield
a larger product) and thus have more chances to beat others in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
XTensor GNMT(const XTensor & length, float alpha);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TOutput::T2TOutput()
{
devID = -1;
vSize = -1;
inSize = -1;
hSize = -1;
}
/* de-constructor */
T2TOutput::~T2TOutput()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
*/
void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#include "../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
#define OUTPUT_NAME "output"
/* output layer */
class T2TOutput
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* input vector size */
int inSize;
/* vector size of the linear transformation */
int hSize;
/* transformation matrix */
XTensor w;
public:
/* constructor */
T2TOutput();
/* de-constructor */
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
/* make the network (redefined output tensor) */
void Make(XTensor &input, XTensor &output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
#include <iostream>
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if (states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = 2;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
>> isStart - indicates whether it is the starting state
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
{
int dims[MAX_TENSOR_DIM_NUM];
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for (int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
SetDataFixedInt(first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s);
inputDec.SetDevice(inputEnc->devID);
}
/* prediction probabilities */
XTensor& output = next->prob;
XTensor decoding;
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, *encoding, &maskDec, maskEncDec, false);
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
/* generate the output probabilities */
m->outputLayer->Make(decoding, output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
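/* A self-contained sketch of the back-pointer traversal above on a toy
   three-state chain (illustrative only; the helper below is an assumption,
   not part of the original API): */
static void BackTraceSketch()
{
    /* build the chain s0 <- s1 <- s2 with predictions 7, 3, 9 */
    T2TState s0, s1, s2;
    s0.prediction = 7; s0.last = NULL;
    s1.prediction = 3; s1.last = &s0;
    s2.prediction = 9; s2.last = &s1;

    /* walking the "last" pointers from s2 visits 9, 3, 7; writing each
       prediction at offset (distance - nsteps) restores the left-to-right
       order 7, 3, 9, exactly as GeneratePaths fills each row of "path" */
    const int distance = 3;
    int path[distance];
    int nsteps = 0;
    for (T2TState* cur = &s2; cur != NULL; cur = cur->last) {
        nsteps++;
        path[distance - nsteps] = cur->prediction;
    }
    (void)path;
}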
/*
get the predictions of the previous step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
XTensor lastPred;
InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
lastPred.Set2DInt(cur->prediction, i, 0);
}
return lastPred;
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
 * This is the first source file I created in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in a batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we have gone over so far */
int nstep;
/* pointer to the previous state */
T2TState* last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* list of states */
T2TState* states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure as in MT inference -
we get the state of the previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel* m;
/* current state */
T2TStateBundle* s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include "T2TSearch.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TSearch::T2TSearch()
{
alpha = 0;
maxLength = 0;
beamSize = 0;
batchSize = 0;
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
T2TSearch::~T2TSearch()
{
if (fullHypos != NULL)
delete[] fullHypos;
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TSearch::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
LoadParamInt(argc, argv, "batchsize", &batchSize, 1);
LoadParamFloat(argc, argv, "lenalpha", &alpha, 1.0F);
LoadParamInt(argc, argv, "endid", endSymbols, -1);
LoadParamInt(argc, argv, "startid", &startSymbol, -1);
LoadParamFloat(argc, argv, "maxlenalpha", &scalarMaxLength, 2.0F);
LoadParamBool(argc, argv, "earlystop", &isEarlyStop, false);
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding,
XTensor * output, XTensor * score)
{
T2TPredictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
XTensor inputBeam;
XTensor paddingBeam;
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input->unitNum / input->GetDim(-1), beamSize);
/* encoder mask */
model->MakeMTMaskEnc(*input, *padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(*input, &maskEnc, false);
encodingBeam = Unsqueeze(encoding, encoding.order - 2, beamSize);
inputBeam = Unsqueeze(*input, input->order - 1, beamSize);
paddingBeam = Unsqueeze(*padding, padding->order - 1, beamSize);
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
/* max output-length = scalar * source-length */
int lengthLimit = (int)(input->GetDim(-1) * scalarMaxLength);
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
T2TStateBundle * states = new T2TStateBundle[lengthLimit + 1];
T2TStateBundle * first = states;
T2TStateBundle * cur = NULL;
T2TStateBundle * next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, input, beamSize, first);
predictor.SetStartSymbol(startSymbol);
first->isStart = true;
/* generate the sequence from left to right */
for(int l = 0 ; l < lengthLimit; l++){
cur = states + l;
next = states + l + 1;
/* read the current state */
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam, l == 0);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(next);
/* expand the search graph */
Expand(cur, next);
/* push complete hypotheses into the heap */
Collect(next);
/* stop searching when all hypotheses are completed */
if(IsAllCompleted(next)){
maxLength = l + 1;
break;
}
}
/* fill the heap with incomplete hypotheses if necessary */
FillHeap(next);
Dump(output, score);
delete[] states;
}
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
*/
void T2TSearch::Prepare(int myBatchSize, int myBeamSize)
{
batchSize = myBatchSize;
beamSize = myBeamSize;
if (fullHypos != NULL)
delete[] fullHypos;
fullHypos = new XHeap<MIN_HEAP, float>[batchSize];
for (int i = 0; i < batchSize; i++)
fullHypos[i].Init(beamSize);
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor& lenPrev = prev->nstep;
XTensor& len = beam->nstep;
XTensor lp;
XTensor mask;
int order = prob.order;
int outputSize = prob.GetDim(-1);
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensorV2(&score, &prob);
InitTensorV2(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensorV2(&len, &lenPrev);
InitTensorV2(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
/* the GNMT-like length penalty */
lp = T2TLengthPenalizer::GNMT(len, alpha);
lp.Reshape(lp.unitNum);
/* score = log-prob/lp */
_DivDim(&probPath, &lp, &score, 0);
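/* lp is assumed to follow the GNMT formula lp = ((5 + len) / 6)^alpha,
which keeps long hypotheses from being unfairly penalized */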
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
_SumDim(&score, &firstMask, &score, 0);
}
InitTensorV2(&mask, prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 1.0F, prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
_SumDim(&score, &mask, &score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
probPathPrev.Reshape(order - 1, dims);
lp.Reshape(order - 1, dims);
mask.Reshape(order - 1, dims);
}
/*
generate tokens for the next state via beam pruning
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Generate(T2TStateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor indexCPU;
XTensor &score = beam->modelScore;
XTensor &index = beam->prediction;
XTensor &preID = beam->preID;
XTensor &probPath = beam->probPath;
XTensor &prob = beam->prob;
int order = score.order;
for (int i = 0; i < order; i++) {
dims[i] = score.GetDim(i);
dimsBeam[i] = score.GetDim(i);
dimsTopK[i] = score.GetDim(i);
}
CheckNTErrors(order >= 3, "The tensor must be of order 3 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.GetDim(-1);
int stride = score.GetDim(-1);
dimsBeam[order - 3] /= beamSize;
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
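/* dimsBeam views the scores as one row per source sentence with
beam-size * vocab-size columns, so a single TopK over the last axis
compares all hypotheses of the same sentence at once */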
InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType, 1.0F, score.devID);
InitTensorV2(&index, order, dimsTopK, X_INT, 1.0F, score.devID);
InitTensorV2(&preID, order, dimsTopK, X_INT, 1.0F, -1);
InitTensorV2(&indexCPU, order, dimsTopK, X_INT, 1.0F, -1);
/* TODO: check the mask - mask the first and the padding id */
int dimMask[]{ score.GetDim(-1) };
XTensor mask;
InitTensorV2(&mask, 1, dimMask, X_FLOAT, 1.0F, -1);
mask.SetZeroAll();
mask.Set1D(-1e9F, 0);
mask.Set1D(-1e9F, 1);
mask.SetDevice(score.devID);
_SumDim(&score, &mask, 2);
score.Reshape(order, dimsBeam);
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize);
CopyValues(index, indexCPU);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypothesis. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We recover the offset of each prediction
in the vocabulary by taking the top-k index modulo vocab-size. */
ModMe(index, sizeVocab);
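/* a small worked example: with vocab-size 4, a flat top-k index of 6 points
to previous state 6 / 4 = 1 and to token 6 % 4 = 2 in the vocabulary */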
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensorV2(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
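/* below we convert each per-row top-k index into a flat offset into the
(batch * beam * vocab) array: i steps by beamSize, so i * stride equals
(i / beamSize) * beam-size * vocab-size, i.e., the offset of the row
that the index belongs to */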
for (int i = 0; i < indexCPU.unitNum; i += beamSize){
for (int j = 0; j < beamSize; j++) {
indexCPU.SetInt(i * stride + indexCPU.GetInt(i + j), i + j);
}
}
CheckNTErrors(IsSameShaped(prob, probPath), "Wrong tensor shape!");
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensorV2(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensorV2(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
dimsTopK[i] = probPathTopK.GetDim(i);
}
order = probPath.order;
prob.Reshape(prob.unitNum, 1);
probPath.Reshape(probPath.unitNum, 1);
indexCPU.Reshape(indexCPU.GetDim(0), indexCPU.GetDim(-1));
indexCPU.SetDevice(prob.devID);
probTopK = Gather(prob, indexCPU);
probPathTopK = Gather(probPath, indexCPU);
probPath.Reshape(order, dims);
probPathTopK.Reshape(order, dimsTopK);
prob.Reshape(order, dims);
probTopK.Reshape(order, dimsTopK);
probPath = probPathTopK;
prob = probTopK;
}
/*
expand the search graph
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensorV2(&endMark, &predictionRef);
/* we copy the data to the CPU because frequent access to the GPU is slow,
and we can speed up the process by doing the job on the CPU. */
CopyValues(idRef, id);
CopyValues(modelScoreRef, modelScore);
CopyValues(probRef, prob);
CopyValues(probPathRef, probPath);
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All of these are
maintained on the CPU to ease frequent access and modification of the states.
An alternative is to do this on the GPU, but it would require much more
coding work and the speed-up is not obvious. */
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
T2TState* last = prev->states + pid * beamSize + offset;
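/* "offset" is the position of the predecessor within the beam of sentence
"pid", so its flat index in the previous bundle is pid * beamSize + offset */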
CheckNTErrors(offset >= 0, "Wrong state index!");
/* pointer to the previous state */
if (prev->isStart) {
state.last = NULL;
state.pid = pid;
state.nstep = 0;
state.isCompleted = false;
}
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
state.isCompleted = last->isCompleted;
CheckNTErrors(offset < prev->stateNum, "Wrong state index!");
}
/* scores */
state.modelScore = modelScore.Get(k);
state.prob = prob.Get(k);
state.probPath = probPath.Get(k);
/* prediction */
state.prediction = prediction.GetInt(k);
CheckNTErrors(state.prediction >= 0, "Illegal prediction!");
/* check if it is the end of the sequence */
state.isEnd = IsEnd(state.prediction);
state.isCompleted = (state.isCompleted || state.isEnd);
/* set the ending mark */
endMarkCPU.SetInt(state.isEnd, k);
}
}
/* copy the ending mark from CPU to the target device */
CopyValues(endMarkCPU, endMark);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Collect(T2TStateBundle* beam)
{
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize, "Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted && (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if (state.isEnd && isCompleted) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
/*
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void T2TSearch::FillHeap(T2TStateBundle* beam)
{
bool* emptyFlags = new bool[batchSize];
for (int i = 0; i < batchSize; i++)
emptyFlags[i] = (fullHypos[i].Count() == 0);
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted && (state.last == NULL || !state.last->isCompleted);
/* we push the incomplete hypothesis into the heap */
if (emptyFlags[state.pid] || state.isEnd || isCompleted)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
delete[] emptyFlags;
}
/*
save the output sequences in a tensor
>> output - output sequences (for return)
>> score - scores of the sequences
*/
void T2TSearch::Dump(XTensor * output, XTensor * score)
{
int dims[3] = { batchSize, beamSize, maxLength };
int* words = new int[maxLength];
InitTensorV2(output, 3, dims, X_INT);
InitTensorV2(score, 2, dims, X_FLOAT);
SetDataFixedInt(*output, -1);
score->SetZeroAll();
/* heap for an input sentence in the batch */
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float> &heap = fullHypos[h];
int c = heap.Count();
/* for each output in the beam */
for(int i = 0; i < beamSize && heap.Count() > 0; i++){
HeapNode<float> node = heap.Pop();
T2TState * state = (T2TState *)node.index;
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
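/* positions generated after the hypothesis finished are overwritten
with the end symbol, which is assumed to have id 2 here */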
if (isCompleted)
words[count++] = 2;
else
words[count++] = state->prediction;
state = state->last;
}
/* dump the sentence to the output tensor */
for(int w = 0; w < count; w++)
output->Set3DInt(words[count - w - 1], h, c - i - 1, w);
score->Set2D(node.value, h, c - i - 1);
}
}
delete[] words;
}
/*
check if the token is an end symbol
>> token - token to be checked
*/
bool T2TSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void T2TSearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool T2TSearch::IsAllCompleted(T2TStateBundle * beam)
{
T2TState * states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
if(!state.isCompleted)
return false;
}
return true;
}
/*
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor T2TSearch::MakeFirstMask(T2TStateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensorV2(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
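/* for illustration: with beamSize = 4 the mask is (0, -1e9, -1e9, -1e9) for
each sentence, so only the first hypothesis survives the first expansion,
since all entries of a fresh beam are identical */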
for (int i = 0; i < mask.unitNum; i++) {
if (i % beamSize != 0)
mask.Set(-1e9F, i);
}
mask.SetDevice(prob.devID);
return mask;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scaling factor on the source length (it determines the maximum number of search steps) */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char** argv);
/* search for the most promising states */
void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output, XTensor* score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle* beam);
/* save the output sequences and score */
void Dump(XTensor* output, XTensor* score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
#include "..//..//tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
/* data files */
FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
/* padding */
XTensor paddingEnc;
/* an array that keeps the sequences */
int* seqs = new int[MILLION];
batchLoader.Init(fn);
int count = 0;
while (!batchLoader.IsEmpty())
{
count++;
wordCount = 0;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensorV2(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
res.values = sent;
res.id = indices[i];
batchLoader.resBuffer.emplace_back(res);
}
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
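/* the modulus of 1 makes this report fire after every batch */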
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n", elapsed, sentCount, wordCount);
}
}
batchLoader.RerankRes();
for (auto res : batchLoader.resBuffer) {
Dump(ofile, &res.values);
}
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE* file, XTensor* output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
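/* stop at an invalid token or at the padding symbol (id 1 by default) */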
if (w < 0 || w == 1)
break;
fprintf(file, "%d ", w);
}
fprintf(file, "\n");
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "t2tdata/DataSet.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* batch size for sentences */
int sentBatch;
/* for batching */
DataSet batchLoader;
/* decoder for inference */
T2TSearch seacher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the model */
void Init(int argc, char** argv);
/* test the model */
void Test(const char* fn, const char* ofn, T2TModel* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
strcpy(p, argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = atoi(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname)){
*p = true;
//fprintf(stderr, " %s=%s\n", name, "true");
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = (float)atof(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
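/* a minimal usage sketch (hypothetical option name, for illustration):

   int beam = 0;
   LoadParamInt(argc, argv, "beamsize", &beam, 4);

   this reads "-beamsize 8" from the command line into "beam", and falls
   back to the default value 4 when the option is absent */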
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#include <stdio.h>
namespace transformer
{
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
}
#endif
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -27,13 +26,13 @@ ...@@ -27,13 +26,13 @@
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include "T2TUtility.h" #include "Utility.h"
#include "../../../tensor/XGlobal.h" #include "../../tensor/XGlobal.h"
using namespace nts; using namespace nts;
using namespace std; using namespace std;
namespace transformer namespace nmt
{ {
/* /*
...@@ -41,7 +40,7 @@ load configurations from the command ...@@ -41,7 +40,7 @@ load configurations from the command
>> argc - number of arguments >> argc - number of arguments
>> argv - the list of arguments >> argv - the list of arguments
*/ */
T2TConfig::T2TConfig(int argc, const char** argv) Config::Config(int argc, const char** argv)
{ {
char** args = new char* [MAX_PARAM_NUM]; char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) { for (int i = 0; i < argc; i++) {
...@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
ShowParams(argsNum, args); ShowParams(argsNum, args);
/* options for the model */ /* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 8); LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1); LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1); LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8); LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 256); LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 256); LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024); LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4); LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000); LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000); LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1); LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2); LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2); LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false); LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, false); LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
LoadParamString(argsNum, args, "model", modelFN, "model.bin");
// TODO: refactor the parameters type to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src"); LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt"); LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
...@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamString(argsNum, args, "train", trainFN, ""); LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, ""); LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0); LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048); LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1); LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true; isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true); LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1); LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0); LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0); LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argc, args, "lrate", &lrate, 1.0F); LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0); LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
LoadParamInt(argc, args, "nepoch", &nepoch, 20); LoadParamInt(argc, args, "nepoch", &nepoch, 50);
LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argc, args, "nstep", &nstep, 100000); LoadParamInt(argc, args, "nstep", &nstep, 100000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000); LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argc, args, "adam", &useAdam, true); LoadParamBool(argc, args, "adam", &useAdam, true);
LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F); LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F); LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
...@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "shuffled", &isShuffled, true); LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1); LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1); LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false); LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1); LoadParamInt(argc, args, "updatestep", &updateStep, 1);
LoadParamBool(argc, args, "debug", &isDebugged, false);
LoadParamBool(argc, args, "sorted", &isLenSorted, false); LoadParamBool(argc, args, "sorted", &isLenSorted, false);
LoadParamInt(argc, args, "bufsize", &bufSize, 50000); LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
...@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true); LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, args, "bigbatch", &isBigBatch, false); LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, args, "randbatch", &isRandomBatch, false); LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, args, "bucketsize", &bucketSize, 0); LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */ /* options for translating */
LoadParamString(argsNum, args, "test", testFN, ""); LoadParamString(argsNum, args, "test", testFN, "");
...@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1); LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false); LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6); LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0); LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++) for (int i = 0; i < argc; i++)
delete[] args[i]; delete[] args[i];
...@@ -136,7 +139,7 @@ load configurations from a file ...@@ -136,7 +139,7 @@ load configurations from a file
>> args - the list to store the configurations >> args - the list to store the configurations
format: one option per line, separated by a blank or a tab format: one option per line, separated by a blank or a tab
*/ */
int T2TConfig::LoadFromFile(const char* configFN, char** args) { int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in); ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file"); CheckNTErrors(f.is_open(), "unable to open the config file");
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,18 +19,18 @@ ...@@ -20,18 +19,18 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/ */
#ifndef __T2TUTILITY_H__ #ifndef __UTILITY_H__
#define __T2TUTILITY_H__ #define __UTILITY_H__
#include <string> #include <string>
#include <cstdio> #include <cstdio>
#include "../../../tensor/XList.h" #include "../../tensor/XList.h"
using namespace std; using namespace std;
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
#define MAX_PARAM_NUM 100 #define MAX_PARAM_NUM 100
...@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter); ...@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter); FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter); UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */ /* configurations for */
class T2TConfig { class Config {
public: public:
/* path to the model */ /* path to the model */
char modelFN[1024]; char modelFN[1024];
...@@ -131,6 +130,12 @@ public: ...@@ -131,6 +130,12 @@ public:
/* indicates whether the model is running for machine translation */ /* indicates whether the model is running for machine translation */
bool isMT; bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */ /* indicates whether the model is running with FP16 data type */
bool useFP16; bool useFP16;
...@@ -164,9 +169,12 @@ public: ...@@ -164,9 +169,12 @@ public:
/* training epoch number */ /* training epoch number */
int nepoch; int nepoch;
/* traing step number */ /* training step number */
int nstep; int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */ /* indicates whether we use Adam */
bool useAdam; bool useAdam;
...@@ -193,9 +201,6 @@ public: ...@@ -193,9 +201,6 @@ public:
/* number of batches on which we do model update */ /* number of batches on which we do model update */
int updateStep; int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */ /* indicates whether the sequence is sorted by length */
bool isLenSorted; bool isLenSorted;
...@@ -222,7 +227,7 @@ public: ...@@ -222,7 +227,7 @@ public:
public: public:
/* load configurations from the command */ /* load configurations from the command */
T2TConfig(int argc, const char** argv); Config(int argc, const char** argv);
/* load configurations from a file */ /* load configurations from a file */
int LoadFromFile(const char* configFN, char** args); int LoadFromFile(const char* configFN, char** args);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -15,22 +14,20 @@ ...@@ -15,22 +14,20 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#include <cmath> #include "Attention.h"
#include "Embedding.h"
#include "T2TUtility.h" #include "../Utility.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TAttention::T2TAttention() Attention::Attention()
{ {
nhead = -1; nhead = -1;
dk = -1; dk = -1;
...@@ -39,7 +36,7 @@ T2TAttention::T2TAttention() ...@@ -39,7 +36,7 @@ T2TAttention::T2TAttention()
} }
/* de-constructor */ /* de-constructor */
T2TAttention::~T2TAttention() Attention::~Attention()
{ {
} }
...@@ -47,7 +44,7 @@ T2TAttention::~T2TAttention() ...@@ -47,7 +44,7 @@ T2TAttention::~T2TAttention()
initialize the model initialize the model
>> config - the configurations of the network >> config - the configurations of the network
*/ */
void T2TAttention::InitModel(T2TConfig& config) void Attention::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
useRPR = config.useRPR; useRPR = config.useRPR;
...@@ -59,28 +56,34 @@ void T2TAttention::InitModel(T2TConfig& config) ...@@ -59,28 +56,34 @@ void T2TAttention::InitModel(T2TConfig& config)
maxRP = config.maxRP; maxRP = config.maxRP;
dropoutP = config.attDropout; dropoutP = config.attDropout;
InitTensor2D(&wq, d, d, X_FLOAT, devID); /* initialize the parameters */
InitTensor1D(&bq, d, X_FLOAT, devID); InitTensor2D(&weightQ, d, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID); InitTensor1D(&biasQ, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID); InitTensor2D(&weightK, d, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID); InitTensor1D(&biasK, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID); InitTensor2D(&weightV, d, d, X_FLOAT, devID);
InitTensor1D(&biasV, d, X_FLOAT, devID);
if (useRPR) if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID); InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&wo, d, d, X_FLOAT, devID);
InitTensor1D(&bo, d, X_FLOAT, devID); InitTensor2D(&weightO, d, d, X_FLOAT, devID);
InitTensor1D(&biasO, d, X_FLOAT, devID);
float scale = 1.0F; float scale = 1.0F;
_SetDataFanInOut(&wk, scale); _SetDataFanInOut(&weightK, scale);
_SetDataFanInOut(&wq, scale); _SetDataFanInOut(&weightQ, scale);
_SetDataFanInOut(&wv, scale); _SetDataFanInOut(&weightV, scale);
_SetDataFanInOut(&wo, scale); _SetDataFanInOut(&weightO, scale);
if (useRPR) if (useRPR)
_SetDataFanInOut(&RPEmbK, scale); _SetDataFanInOut(&RPEmbK, scale);
bk.SetZeroAll();
bq.SetZeroAll(); biasQ.SetZeroAll();
bv.SetZeroAll(); biasO.SetZeroAll();
bo.SetZeroAll();
biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
} }
/* /*
...@@ -96,30 +99,30 @@ make the network ...@@ -96,30 +99,30 @@ make the network
>> cacheType - type of cache, e.g., self-attention >> cacheType - type of cache, e.g., self-attention
<< return - multi-attention result << return - multi-attention result
*/ */
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int cacheType) bool isTraining, Cache* cache, int attType)
{ {
const bool isEnc = (!cache) ? true : false; const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */ /* linear transformation before self-attention */
XTensor q2, k2, v2; XTensor q2, k2, v2;
q2 = MulAndShift(q, wq, bq); q2 = MulAndShift(q, weightQ, biasQ);
if (!cache || isTraining) { if (!cache || isTraining || !(cache->enable)) {
/* self attention for encoder layers */ /* self attention for encoder layers */
k2 = MulAndShift(k, wk, bk); k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, wv, bv); v2 = MulAndShift(v, weightV, biasV);
if (useRPR) if (useRPR && attType == SELF_ATT)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc); return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining); return MakeAttention(k2, q2, v2, mask, isTraining);
} }
else { else {
if (cacheType == SELF_ATT) { if (attType == SELF_ATT) {
k2 = MulAndShift(k, wk, bk); k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, wv, bv); v2 = MulAndShift(v, weightV, biasV);
/* if hit, we only concat the cache with the new token */ /* if hit, we only concat the cache with the new token */
if (!cache->miss) { if (!cache->miss) {
...@@ -134,10 +137,10 @@ XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, ...@@ -134,10 +137,10 @@ XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc); return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining); return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
} }
else if (cacheType == EN_DE_ATT) { else if (attType == EN_DE_ATT) {
if (cache->miss) { if (cache->miss) {
cache->key = MulAndShift(k, wk, bk); cache->key = MulAndShift(k, weightK, biasK);
cache->value = MulAndShift(v, wv, bv); cache->value = MulAndShift(v, weightV, biasV);
cache->miss = false; cache->miss = false;
} }
...@@ -155,7 +158,7 @@ make the attention network given keys, queries and values (after linear transfor ...@@ -155,7 +158,7 @@ make the attention network given keys, queries and values (after linear transfor
>> mask - as it is >> mask - as it is
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
*/ */
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining) XTensor* mask, bool isTraining)
{ {
XTensor kheads; XTensor kheads;
...@@ -185,7 +188,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -185,7 +188,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS); dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask) if (mask)
dot = dot + (*mask); dot = dot + *mask;
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead)); dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
...@@ -203,7 +206,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -203,7 +206,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType); att = ConvertDataType(att, dataType);
/* concatenate the heads */ /* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo); return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
} }
/* /*
...@@ -216,16 +219,16 @@ with the given keys, queries and values (after linear transformation) ...@@ -216,16 +219,16 @@ with the given keys, queries and values (after linear transformation)
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder >> isEnc - indicates whether it is encoder
*/ */
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc) XTensor* mask, bool isTraining, bool isEnc)
{ {
XTensor kheads; XTensor kheads;
XTensor qheads; XTensor qheads;
XTensor vheads; XTensor vheads;
const int batchSize = q.dimSize[0]; const int batchSize = q.GetDim(0);
const int lenQ = q.dimSize[1]; const int lenQ = q.GetDim(1);
const int lenKV = k.dimSize[1]; const int lenKV = k.GetDim(1);
const auto dataType = k.dataType; const auto dataType = k.dataType;
...@@ -241,7 +244,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -241,7 +244,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor embMatrix, relativeKey; XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */ /* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc); embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */ /* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix); relativeKey = Gather(RPEmbK, embMatrix);
...@@ -252,12 +255,13 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -252,12 +255,13 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT); relativeKey = ConvertDataType(relativeKey, X_FLOAT);
} }
ScaleAndShiftMe(qheads, 1.0F / float(nhead)); float scaling = sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true); dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask) if (mask)
dot = dot + (*mask); dot = dot + *mask;
/* softmax */ /* softmax */
scalar = Softmax(dot, -1); scalar = Softmax(dot, -1);
...@@ -275,7 +279,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -275,7 +279,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType); att = ConvertDataType(att, dataType);
/* concatenate the heads */ /* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo); return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
} }
/* /*
...@@ -284,7 +288,7 @@ generate relative position embeddings ...@@ -284,7 +288,7 @@ generate relative position embeddings
>> lenKV - the length of key and value >> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position >> maxRelativeLen - the maximum length of relative position
*/ */
XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV, XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc) const int maxRelativeLen, const bool isEnc)
{ {
XTensor range; XTensor range;
...@@ -300,7 +304,7 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV, ...@@ -300,7 +304,7 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
XTensor range2DTrans; XTensor range2DTrans;
range2D = Unsqueeze(range, 0, lenQ); range2D = Unsqueeze(range, 0, lenQ);
range2DTrans = Transpose(range2D, 0, 1); range2DTrans = Transpose(range2D, 0, 1);
embMatrix = Sum(range2D, range2DTrans, -1); embMatrix = Sum(range2D, range2DTrans, false, -1);
} }
else { else {
for (int i = 0; i < lenKV; i++) for (int i = 0; i < lenKV; i++)
...@@ -309,37 +313,46 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV, ...@@ -309,37 +313,46 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
embMatrix = Unsqueeze(range, 0, lenQ); embMatrix = Unsqueeze(range, 0, lenQ);
} }
ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen)); //ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
ScaleAndShiftMe(embMatrix, 1.0F, float(maxRelativeLen)); embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index; delete[] index;
return embMatrix; return embMatrix;
} }
/* /*
Relative position-aware dot-product attention inner calculation. relative position-aware dot-product attention inner calculation.
>> x - Tensor with shape [batch_size*heads, length, length or depth]. >> x - Tensor with shape [batch_size*heads, length, length or depth].
>> y - Tensor with shape [batch_size*heads, length, depth]. >> y - Tensor with shape [batch_size*heads, length, depth].
>> z - Tensor with shape [length, length, depth]. >> z - Tensor with shape [length, length, depth].
>> isKey - Whether y is key. >> isKey - Whether y is key.
<< return - A Tensor with shape [batch_size*heads, length, length or depth]. << return - A Tensor with shape [batch_size*heads, length, length or depth].
*/ */
XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey) XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{ {
const int headNum = nhead; const int headNum = nhead;
const int batchSize = x.dimSize[1]; const int batchSize = x.GetDim(1);
const int lenQ = x.dimSize[2]; const int lenQ = x.GetDim(2);
const int lenKV = y.dimSize[2]; const int lenKV = y.GetDim(2);
const int depth = y.dimSize[3]; const int depth = y.GetDim(3);
const int lastDim = isKey ? lenKV : depth; const int lastDim = isKey ? lenKV : depth;
MATRIX_TRANS_TYPE transposeFlag = isKey ? X_TRANS : X_NOTRANS; auto transposeFlag = isKey ? X_TRANS : X_NOTRANS;
XTensor context; int mergeDimsX[] = { headNum * batchSize, lenQ, x.GetDim(3) };
context = MatrixMulBatched(x, X_NOTRANS, y, transposeFlag); int mergeDimsY[] = { headNum * batchSize, lenKV, y.GetDim(3) };
x = Reshape(x, 3, mergeDimsX);
y = Reshape(y, 3, mergeDimsY);
if (isKey) {
y = Transpose(y, 1, 2);
}
int mergeDims[] = { headNum * batchSize, lenQ, x.dimSize[3] }; XTensor context;
x.Reshape(3, mergeDims); context = BMMul(x, y);
int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
context = Reshape(context, 4, newDims);
XTensor xTrans; XTensor xTrans;
xTrans = Transpose(x, 0, 1); xTrans = Transpose(x, 0, 1);
...@@ -351,15 +364,17 @@ XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const boo ...@@ -351,15 +364,17 @@ XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const boo
relativeTrans = Transpose(relative, 0, 1); relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim }; int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans.Reshape(4, splitDims);
return Sum(context, relativeTrans); relativeTrans = Reshape(relativeTrans, 4, splitDims);
return context + relativeTrans;
} }
/* constructor */ /* constructor */
Cache::Cache() Cache::Cache()
{ {
miss = true; miss = true;
enable = true;
} }
/* update the states cache */ /* update the states cache */
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,17 +19,17 @@ ...@@ -20,17 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#ifndef __T2TATTENTION_H__ #ifndef __ATTENTION_H__
#define __T2TATTENTION_H__ #define __ATTENTION_H__
#include "T2TNNUtil.h" #include "NNUtil.h"
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../network/XNet.h" #include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* attention type */ /* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT }; enum { NONE, SELF_ATT, EN_DE_ATT };
...@@ -50,6 +49,9 @@ public: ...@@ -50,6 +49,9 @@ public:
/* indicates cache miss if 'true' */ /* indicates cache miss if 'true' */
bool miss; bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */ /* constructor */
Cache(); Cache();
...@@ -64,7 +66,7 @@ public: ...@@ -64,7 +66,7 @@ public:
}; };
/* multi-head attention */ /* multi-head attention */
class T2TAttention class Attention
{ {
public: public:
/* device id */ /* device id */
...@@ -74,22 +76,22 @@ public: ...@@ -74,22 +76,22 @@ public:
int nhead; int nhead;
/* transformation matrix for Q */ /* transformation matrix for Q */
XTensor wq; XTensor weightQ;
/* bias for Q */ /* bias for Q */
XTensor bq; XTensor biasQ;
/* transformation matrix for K */ /* transformation matrix for K */
XTensor wk; XTensor weightK;
/* bias for K */ /* bias for K */
XTensor bk; XTensor biasK;
/* transformation matrix for V */ /* transformation matrix for V */
XTensor wv; XTensor weightV;
/* bias for V */ /* bias for V */
XTensor bv; XTensor biasV;
XTensor wBig; XTensor wBig;
...@@ -99,10 +101,10 @@ public: ...@@ -99,10 +101,10 @@ public:
XTensor RPEmbK; XTensor RPEmbK;
/* transformation after dot-product attention */ /* transformation after dot-product attention */
XTensor wo; XTensor weightO;
/* bias after dot-product attention */ /* bias after dot-product attention */
XTensor bo; XTensor biasO;
/* size of transformed Q and K */ /* size of transformed Q and K */
int dk; int dk;
...@@ -124,13 +126,13 @@ public: ...@@ -124,13 +126,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TAttention(); Attention();
/* de-constructor */ /* de-constructor */
~T2TAttention(); ~Attention();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v, XTensor Make(XTensor& k, XTensor& q, XTensor& v,
...@@ -145,8 +147,10 @@ public: ...@@ -145,8 +147,10 @@ public:
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc); XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc); XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key); XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,13 +19,11 @@ ...@@ -20,13 +19,11 @@
* This file includes some common modules of the Transformer model * This file includes some common modules of the Transformer model
*/ */
#include <cmath> #include "CommonModules.h"
#include "T2TCommonModules.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace nmt
{ {
/* /*
...@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer ...@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
>> before - whether we use layernorm before attention/fnn >> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn >> after - whether we use layernorm after attention/fnn
*/ */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after) XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{ {
if (after ^ prenorm) if (after ^ prenorm)
return ln.Make(input); return ln.Make(input);
......
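The `after ^ prenorm` test is terse; read it as a truth table (the pass-through case, outside the visible hunk, is assumed to return the input unchanged):
    /* when does LayerNorm(input, ln, prenorm, before, after) apply ln.Make?
       prenorm == true  (pre-norm model):  at the "before" call site (after == false)
       prenorm == false (post-norm model): at the "after" call site  (after == true)
       in the remaining two combinations the input passes through unchanged */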
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -22,16 +21,16 @@ ...@@ -22,16 +21,16 @@
#ifndef __COMMONMODULE_H__ #ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__ #define __COMMONMODULE_H__
#include "T2TLayerNormal.h" #include "LayerNorm.h"
#include "T2TCommonModules.h" #include "CommonModules.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* the layer normalization module to control pre-norm or post-norm */ /* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after); XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,17 +19,15 @@ ...@@ -20,17 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/ */
#include <cmath> #include "Embedding.h"
#include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TEmbedder::T2TEmbedder() Embedder::Embedder()
{ {
devID = -1; devID = -1;
vSize = -1; vSize = -1;
...@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder() ...@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
} }
/* de-constructor */ /* de-constructor */
T2TEmbedder::~T2TEmbedder() Embedder::~Embedder()
{ {
} }
...@@ -47,7 +44,7 @@ initialize the model ...@@ -47,7 +44,7 @@ initialize the model
>> config - configurations of the model >> config - configurations of the model
>> isEnc - indicates if it is used for the encoder >> isEnc - indicates if it is used for the encoder
*/ */
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc) void Embedder::InitModel(Config& config, bool isEnc)
{ {
devID = config.devID; devID = config.devID;
d = config.modelSize; d = config.modelSize;
...@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc) ...@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
make positional embeddings (of size eSize * length) make positional embeddings (of size eSize * length)
>> length - length of the sequence >> length - length of the sequence
*/ */
void T2TEmbedder::MakePosEmbedding(int length) void Embedder::MakePosEmbedding(int length)
{ {
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID); InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
...@@ -110,58 +107,45 @@ make the network ...@@ -110,58 +107,45 @@ make the network
>> isTraining - indicates whether it is training >> isTraining - indicates whether it is training
<< return - word & position embeddings of the input << return - word & position embeddings of the input
*/ */
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep) XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{ {
/* make sure the padding index is 1 */ /* make sure the padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!"); CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!"); CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\""); CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\""); CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding; XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU; InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
if (!isDec) if (!isDec || isTraining || input.GetDim(-1) > 1)
{ {
/* encoder embeddings */ position.Range(0, position.unitNum, 1);
for (int i = 0; i < inputCPU.dimSize[0]; i++) {
int startNoPad = 1 + 1; // disable grad
int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1]; ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
for (int j = 0; j < inputCPU.dimSize[1]; j++) {
if (p[j] == 1) {
posData[i * inputCPU.dimSize[1] + j] = 1;
}
else {
posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
} }
else else
{ {
/* decoder embeddings */ /* decoder embeddings during decoding */
position.SetDataFixed(nstep + 2); position.SetDataFixed(nstep + padIdx + 1);
} }
delete[] posData;
/* we make positional embeddings first */ /* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position); XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */ /* then we make word embeddings */
//w.enableGrad = false;
wordEmbedding = Gather(w, input); wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize)); wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */ /* we sum over the two embeddings */
return wordEmbedding + posEmbedding; SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
} }
} }
\ No newline at end of file
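The replacement of the per-token CPU loop above is easier to follow with a worked example (assuming padIdx == 1 and a sequence of length 4):
    /* encoder branch:
       position.Range(0, 4, 1)                      -> [0, 1, 2, 3]
       ScaleAndShiftMe(position, 1.0F, padIdx + 1)  -> [2, 3, 4, 5]
       single-step decoding instead fills every position with
       nstep + padIdx + 1, so real positions never collide with
       the padding index */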
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,15 +19,15 @@ ...@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/ */
#ifndef __T2TEMBEDDING_H__ #ifndef __EMBEDDING_H__
#define __T2TEMBEDDING_H__ #define __EMBEDDING_H__
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../network/XNet.h" #include "../../../network/XNet.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
#define DEFAULT_EMBEDDING_SIZE 512 #define DEFAULT_EMBEDDING_SIZE 512
...@@ -37,7 +36,7 @@ namespace transformer ...@@ -37,7 +36,7 @@ namespace transformer
embedding (of word at position i): embedding (of word at position i):
word embedding + positional embedding word embedding + positional embedding
*/ */
class T2TEmbedder class Embedder
{ {
public: public:
/* device id */ /* device id */
...@@ -52,7 +51,7 @@ public: ...@@ -52,7 +51,7 @@ public:
/* maximum length of the sequence */ /* maximum length of the sequence */
int maxLength; int maxLength;
/* dimension size of the hidden layers in the t2t model */ /* dimension size of the hidden layers in the model */
int d; int d;
/* padding index */ /* padding index */
...@@ -67,13 +66,13 @@ public: ...@@ -67,13 +66,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TEmbedder(); Embedder();
/* de-constructor */ /* de-constructor */
~T2TEmbedder(); ~Embedder();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config, bool isEnc = true); void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */ /* make positional embeddings */
void MakePosEmbedding(int length); void MakePosEmbedding(int length);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,19 +19,17 @@ ...@@ -20,19 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <cmath> #include "FNN.h"
#include "Embedding.h"
#include "T2TFNN.h" #include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TFNN::T2TFNN() FNN::FNN()
{ {
inSize = -1; inSize = -1;
outSize = -1; outSize = -1;
...@@ -40,7 +37,7 @@ T2TFNN::T2TFNN() ...@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
} }
/* de-constructor */ /* de-constructor */
T2TFNN::~T2TFNN() FNN::~FNN()
{ {
} }
...@@ -50,7 +47,7 @@ initialize the model ...@@ -50,7 +47,7 @@ initialize the model
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> config - configurations of the model >> config - configurations of the model
*/ */
void T2TFNN::InitModel(T2TConfig& config) void FNN::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
...@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config) ...@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
_SetDataFanInOut(&w1, scale); _SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale); _SetDataFanInOut(&w2, scale);
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll(); b1.SetZeroAll();
b2.SetZeroAll(); b2.SetZeroAll();
} }
...@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2 ...@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor >> input - the input tensor
>> return - the output tensor >> return - the output tensor
*/ */
XTensor T2TFNN::Make(XTensor& input, bool isTraining) XTensor FNN::Make(XTensor& input, bool isTraining)
{ {
XTensor t1; XTensor t1;
......
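The hunk above truncates the body of FNN::Make. For reference, a minimal sketch of the documented formula y = max(0, x * w1 + b1) * w2 + b2, assuming the library's MMul and Rectify helpers and eliding dropout during training:
    /* illustrative sketch, not the actual FNN::Make */
    XTensor FNNSketch(FNN& fnn, XTensor& input)
    {
        /* t1 = max(0, x * w1 + b1) */
        XTensor t1 = Rectify(MMul(input, fnn.w1) + fnn.b1);
        /* y = t1 * w2 + b2 */
        return MMul(t1, fnn.w2) + fnn.b2;
    }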
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,20 +19,20 @@ ...@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TFNN_H__ #ifndef __FNN_H__
#define __T2TFNN_H__ #define __FNN_H__
#include "T2TUtility.h" #include "LayerNorm.h"
#include "T2TLayerNormal.h" #include "../Utility.h"
#include "../../../tensor/XTensor.h" #include "../../../tensor/XTensor.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */ /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN class FNN
{ {
public: public:
/* device id */ /* device id */
...@@ -66,13 +65,13 @@ public: ...@@ -66,13 +65,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TFNN(); FNN();
/* de-constructor */ /* de-constructor */
~T2TFNN(); ~FNN();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& input, bool isTraining); XTensor Make(XTensor& input, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -19,16 +18,13 @@ ...@@ -19,16 +18,13 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/ */
#include "GLU.h"
#include <cmath> #include "Embedding.h"
#include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
...@@ -48,7 +44,7 @@ GLU::~GLU() ...@@ -48,7 +44,7 @@ GLU::~GLU()
initialize the model initialize the model
>> config - configurations of the model >> config - configurations of the model
*/ */
void GLU::InitModel(T2TConfig& config) void GLU::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -23,12 +22,11 @@ ...@@ -23,12 +22,11 @@
#ifndef __GLU_H__ #ifndef __GLU_H__
#define __GLU_H__ #define __GLU_H__
#include "T2TLayerNormal.h" #include "LayerNorm.h"
#include "T2TGatedLinearUnit.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* a gated linear unit (GLU) */ /* a gated linear unit (GLU) */
...@@ -68,7 +66,7 @@ public: ...@@ -68,7 +66,7 @@ public:
~GLU(); ~GLU();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& input); XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -19,19 +18,16 @@ ...@@ -19,19 +18,16 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/ */
#include <cmath> #include "Embedding.h"
#include "LayerNorm.h"
#include "T2TUtility.h" #include "LayerHistory.h"
#include "T2TEmbedding.h" #include "../Utility.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false) #define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false) #define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
...@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory() ...@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
initialize the model initialize the model
>> config - configurations of the model >> config - configurations of the model
*/ */
void LayerHistory::InitModel(T2TConfig& config) void LayerHistory::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
d = config.modelSize; d = config.modelSize;
...@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config) ...@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID); InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new T2TLN[nlayer]; layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */ /* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) { for (int i = 0; i < nlayer; i++) {
......
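A note on the declarations above: `weight` is sized (nlayer + 1) x (nlayer + 1) and one layer-normalization unit is kept per layer, which is consistent with each layer consuming a learned, normalized combination of the embedding and all preceding layer outputs. The Add/Pop logic that would confirm this lies outside the visible hunks, so read this as an inference from the shapes rather than a description of the implementation.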
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -22,14 +21,14 @@ ...@@ -22,14 +21,14 @@
#ifndef __LAYERHISTORY_H__ #ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__ #define __LAYERHISTORY_H__
#include "T2TLayerNormal.h" #include "LayerNorm.h"
#include "T2TLayerHistory.h" #include "LayerHistory.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* /*
...@@ -61,7 +60,7 @@ public: ...@@ -61,7 +60,7 @@ public:
TensorList history; TensorList history;
/* layer normalization for each intermediate layer */ /* layer normalization for each intermediate layer */
T2TLN* layerNorms; LN* layerNorms;
public: public:
/* constructor */ /* constructor */
...@@ -71,7 +70,7 @@ public: ...@@ -71,7 +70,7 @@ public:
~LayerHistory(); ~LayerHistory();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* add the layer output to the history */ /* add the layer output to the history */
void Add(XTensor& tensor); void Add(XTensor& tensor);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,24 +19,23 @@ ...@@ -20,24 +19,23 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <cmath> #include "Embedding.h"
#include "T2TUtility.h" #include "LayerNorm.h"
#include "T2TEmbedding.h" #include "../Utility.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TLN::T2TLN() LN::LN()
{ {
devID = -1; devID = -1;
d = 0; d = 0;
} }
/* de-constructor */ /* de-constructor */
T2TLN::~T2TLN() LN::~LN()
{ {
} }
...@@ -47,7 +45,7 @@ initialize the model ...@@ -47,7 +45,7 @@ initialize the model
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> config - configurations of the model >> config - configurations of the model
*/ */
void T2TLN::InitModel(T2TConfig& config) void LN::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
...@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config) ...@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
InitTensor1D(&b, d, X_FLOAT, devID); InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F); w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll(); b.SetZeroAll();
w.SetDataFixed(1);
} }
/* /*
...@@ -64,7 +64,7 @@ make the network ...@@ -64,7 +64,7 @@ make the network
>> input - the input tensor >> input - the input tensor
>> return - layer normalization output >> return - layer normalization output
*/ */
XTensor T2TLN::Make(XTensor& input) XTensor LN::Make(XTensor& input)
{ {
XTensor& x = input; XTensor& x = input;
XTensor xn; XTensor xn;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,20 +19,20 @@ ...@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TLAYERNORMAL_H__ #ifndef __LAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__ #define __LAYERNORMAL_H__
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../network/XNet.h" #include "../../../network//XNet.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* layer normalization: y = norm(x) * w + b /* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */ where norm(x) = (x - mean)/standardDeviation */
class T2TLN class LN
{ {
public: public:
/* device id */ /* device id */
...@@ -50,13 +49,13 @@ public: ...@@ -50,13 +49,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TLN(); LN();
/* de-constructor */ /* de-constructor */
~T2TLN(); ~LN();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& input); XTensor Make(XTensor& input);
......
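Given the header's formula y = norm(x) * w + b with norm(x) = (x - mean)/standardDeviation, a minimal sketch of the computation, assuming the library's ReduceMean, ReduceVariance and Normalize helpers (the actual LN::Make body lies outside the visible hunks):
    /* illustrative sketch, not the actual LN::Make */
    XTensor LNSketch(LN& ln, XTensor& input)
    {
        int dim = input.order - 1;
        /* per-position statistics over the feature dimension */
        XTensor mean = ReduceMean(input, dim);
        XTensor variance = ReduceVariance(input, dim, mean);
        /* y = (x - mean) / standardDeviation * w + b */
        return Normalize(input, dim, mean, variance, ln.w, ln.b, 0.0F);
    }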
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -16,12 +15,12 @@ ...@@ -16,12 +15,12 @@
*/ */
/* /*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/ */
#include "T2TNNUtil.h" #include "NNUtil.h"
namespace transformer namespace nmt
{ {
/* /*
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -16,11 +15,11 @@ ...@@ -16,11 +15,11 @@
*/ */
/* /*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/ */
#ifndef __T2TNNUTIL_H__ #ifndef __NNUTIL_H__
#define __T2TNNUTIL_H__ #define __NNUTIL_H__
#include "../../../tensor/XGlobal.h" #include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
...@@ -28,7 +27,7 @@ ...@@ -28,7 +27,7 @@
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* the gather function for tensor with any dimension */ /* the gather function for tensor with any dimension */
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,18 +19,16 @@ ...@@ -20,18 +19,16 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <cmath> #include "Output.h"
#include "Embedding.h"
#include "T2TOutput.h" #include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TOutput::T2TOutput() Output::Output()
{ {
devID = -1; devID = -1;
vSize = -1; vSize = -1;
...@@ -39,7 +36,7 @@ T2TOutput::T2TOutput() ...@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
} }
/* de-constructor */ /* de-constructor */
T2TOutput::~T2TOutput() Output::~Output()
{ {
} }
...@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput() ...@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
initialize the model initialize the model
>> config - configurations of the model >> config - configurations of the model
*/ */
void T2TOutput::InitModel(T2TConfig& config) void Output::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
hSize = config.modelSize; hSize = config.modelSize;
...@@ -66,7 +63,7 @@ make the network (redefined output tensor) ...@@ -66,7 +63,7 @@ make the network (redefined output tensor)
>> isTraining - whether it is used for training >> isTraining - whether it is used for training
>> normalized - whether ignore the log-softmax >> normalized - whether ignore the log-softmax
*/ */
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized) void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{ {
XTensor& x = input; XTensor& x = input;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,19 +19,19 @@ ...@@ -20,19 +19,19 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TOUTPUT_H__ #ifndef __OUTPUT_H__
#define __T2TOUTPUT_H__ #define __OUTPUT_H__
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* output layer */ /* output layer */
class T2TOutput class Output
{ {
public: public:
/* device id */ /* device id */
...@@ -49,13 +48,13 @@ public: ...@@ -49,13 +48,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TOutput(); Output();
/* de-constructor */ /* de-constructor */
~T2TOutput(); ~Output();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network (redefined output tensor) */ /* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized); void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include "..//..//..//tensor/XUtility.h"
using namespace nts;
/* sort examples by sequence length (longest first) */
bool Compare(const Example& a, const Example& b) {
return a.values.size() > b.values.size();
}
/* sort results by their original line ids */
bool CompareRes(const Result& a, const Result& b) {
return a.id < b.id;
}
/* restore the original order of the results for output */
void DataSet::RerankRes() {
sort(resBuffer.begin(), resBuffer.end(), CompareRes);
}
/*
load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
const string tokenDelimiter = " ";
int id = 0;
while (getline(*fp, line)) {
vector<int> values = Split<int>(line, tokenDelimiter);
Example example;
example.id = id++;
example.values = values;
buffer.emplace_back(example);
}
if (fp->eof()) {
/* reset the state flags so the stream can be read again */
fp->clear();
fp->seekg(0, fp->beg);
}
if (sortBuffer) {
sort(buffer.begin(), buffer.end(), Compare);
}
resBuffer.reserve(buffer.size());
}
/*
select a field and generate a mini-batch by indices
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> batchSize - batch size
>> devID - device id, -1 for CPU
*/
vector<int> DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID)
{
/* the real batch size may be smaller when the buffer is nearly exhausted */
size_t realBatchSize = batchSize;
if ((buffer.size() - bufferUsed) < batchSize) {
realBatchSize = buffer.size() - bufferUsed;
}
/* get the maximum sentence length in the mini-batch */
size_t maxLen = 0;
for (size_t i = 0; i < realBatchSize; ++i) {
maxLen = max(maxLen, buffer[bufferUsed + i].values.size());
}
CheckNTErrors(maxLen != 0, "Wrong length detected!");
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
/* fill the batch with the padding index (1) and clear the paddings */
for (size_t i = 0; i < realBatchSize * maxLen; ++i) {
batchValues[i] = 1;
}
memset(paddingValues, 0, sizeof(float) * maxLen * realBatchSize);
size_t cur = 0;
/* left padding */
vector<int> indices;
indices.reserve(realBatchSize);
for (size_t i = 0; i < realBatchSize; ++i) {
indices.push_back(buffer[bufferUsed + i].id);
cur = maxLen * (i + 1) - buffer[bufferUsed+i].values.size();
for (int v : buffer[bufferUsed + i].values) {
batchValues[cur] = v;
paddingValues[cur++] = 1.0F;
}
cur = maxLen * (i + 1);
}
InitTensor2DV2(batchEnc, (int)realBatchSize, (int)maxLen, X_INT, devID);
InitTensor2DV2(paddingEnc, (int)realBatchSize, (int)maxLen, X_FLOAT, devID);
bufferUsed += realBatchSize;
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
return indices;
}
/*
initialize the dataset and load data into the buffer
>> fname - path of the data file
*/
void DataSet::Init(const char* fname)
{
fp = new ifstream(fname);
CheckNTErrors(fp->is_open(), "Cannot open the file!");
bufferUsed = 0;
LoadDataToBuffer();
if (bufferSize == 0)
bufferSize = buffer.size();
}
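A typical consumer of this class loops as below; the model invocation is a placeholder and not part of this file:
    DataSet set;
    set.Init("input.txt");
    XTensor batchEnc;
    XTensor paddingEnc;
    while (!set.IsEmpty()) {
        /* the returned indices are the original line ids of the batch */
        vector<int> indices = set.LoadBatch(&batchEnc, &paddingEnc, 32, -1);
        /* ... run the model and append one Result per index to set.resBuffer ... */
    }
    set.RerankRes();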
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include "../../..//tensor/XTensor.h"
#include "../../..//tensor/XGlobal.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using namespace nts;
/* a single input sample: its original line id and the token ids */
struct Example {
int id;
vector<int> values;
};
/* a single output result: the original line id and its output tensor */
struct Result {
int id;
XTensor values;
};
using BufferType = vector<Example>;
using ResBufferType = vector<Result>;
bool Compare(const Example& a, const Example& b);
bool CompareRes(const Result& a, const Result& b);
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A `DataSet` is associated with a file which contains variable-length data. */
struct DataSet {
/* the data buffer */
BufferType buffer;
/* the result buffer */
ResBufferType resBuffer;
/* the pointer to file stream */
ifstream* fp{nullptr};
/* size of the data buffer */
size_t bufferSize{ 0 };
/* size of used data in buffer */
size_t bufferUsed{ 0 };
/* whether to sort the dataset (by length) */
bool sortBuffer{ true };
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* rerank result for output */
void RerankRes();
/* generate a mini-batch */
vector<int> LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID);
/* initialization function */
void Init(const char* fname);
/* check if the buffer is empty */
bool IsEmpty() {
return bufferUsed >= bufferSize;
}
/* de-constructor */
~DataSet() {
if (fp)
fp->close();
delete fp;
}
};
} // namespace nts(NiuTrans.Tensor)
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "StringUtil.h"
namespace nts {
/* split string by delimiter, this will return indices of all sub-strings */
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
vector<pair<int, int>> fields;
if (delimiter.length() == 0) {
fields.emplace_back(0, s.length());
return fields;
}
size_t pos = 0;
size_t start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
fields.emplace_back((int)start, (int)pos);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
fields.emplace_back((int)start, (int)s.length());
}
return fields;
}
}
\ No newline at end of file
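A worked example of the contract (consecutive delimiters produce no empty field):
    /* SplitToPos("12 7  3", " ") -> {(0, 2), (3, 4), (6, 7)},
       i.e. the substrings "12", "7" and "3"; the doubled space
       contributes no empty field */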
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
vector<string> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(s.substr(p.first, p.second - p.first));
}
return fields;
}
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
vector<int> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
vector<int64_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
vector<float> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtof(s.data() + p.first, nullptr));
}
return fields;
}
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
vector<uint8_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
} // namespace nts
#endif // __STRING_UTIL_H__
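Usage is then one call per input line, as in DataSet::LoadDataToBuffer, e.g.:
    vector<int> ids = Split<int>("12 7 9941 3", " ");    /* {12, 7, 9941, 3} */
    vector<float> ws = Split<float>("0.5 1.25", " ");    /* {0.5F, 1.25F} */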
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include "T2TBatchLoader.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../network/XNoder.h"
namespace transformer
{
/* constructor */
T2TBatchLoader::T2TBatchLoader()
{
seqLen = NULL;
seqLen2 = NULL;
nseqBuf = 0;
nextSeq = -1;
nextBatch = -1;
buf = NULL;
buf2 = NULL;
bufBatch = NULL;
bufSize = 0;
bufBatchSize = 0;
seqOffset = NULL;
}
/* de-constructor */
T2TBatchLoader::~T2TBatchLoader()
{
delete[] buf;
delete[] buf2;
delete[] bufBatch;
delete[] seqLen;
delete[] seqLen2;
delete[] seqOffset;
}
/*
initialization
>> config - configurations of the model
*/
void T2TBatchLoader::Init(T2TConfig& config)
{
bufSize = config.bufSize;
isDoubledEnd = config.isDoubledEnd;
isSmallBatch = config.isSmallBatch;
isBigBatch = config.isBigBatch;
isRandomBatch = config.isRandomBatch;
bucketSize = config.bucketSize;
buf = new int[bufSize];
buf2 = new int[bufSize];
bufBatch = new BatchNode[bufSize];
seqLen = new int[bufSize];
seqLen2 = new int[bufSize];
seqOffset = new int[bufSize];
}
/* a shared buffer for reading lines of the training data */
char line[MAX_SEQUENCE_LENGTH];
struct SampleNode
{
int id;
int offset;
int* p;
int size;
int value;
int key;
};
/* sort samples by their maximum sequence length (longest first) */
int CompareSampleNode(const void* a, const void* b)
{
return ((SampleNode*)b)->value - ((SampleNode*)a)->value;
}
/* sort samples by their random keys (for shuffling within buckets) */
int CompareSampleNodeV2(const void* a, const void* b)
{
return ((SampleNode*)b)->key - ((SampleNode*)a)->key;
}
/*
load data to buffer
>> file - where to load data
>> isSorted - indicates whether the samples are sorted by length
>> step - the number of sequences we go over when move to the next sample
*/
int T2TBatchLoader::LoadBuf(FILE* file, bool isSorted, int step)
{
int lineCount = 0;
int seqCount = 0;
int wordCount = 0;
while (fgets(line, MAX_SEQUENCE_LENGTH - 1, file)) {
int len = (int)strlen(line);
/* strip trailing newline characters (guarding against empty lines) */
while (len > 0 && (line[len - 1] == '\r' || line[len - 1] == '\n')) {
line[len - 1] = 0;
len--;
}
if (len == 0)
continue;
/* how many characters are in a word */
int wSize = 0;
/* how many words are in the sentence */
int wNum = 0;
int wNumLocal = 0;
int i = 0;
for (i = 0; i < len; i++) {
/* load a word (id) separated by a space or tab */
if ((line[i] == ' ' || line[i] == '\t') && wSize > 0) {
line[i] = 0;
if (wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|') {
seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++;
wNumLocal = 0;
}
else {
buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++;
}
wSize = 0;
}
else
wSize++;
}
if (wSize > 0) {
buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++;
}
seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++;
wordCount += wNum;
lineCount++;
if (wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
break;
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
}
nseqBuf = seqCount;
nextSeq = 0;
/* sort the sequences by length */
if (isSorted) {
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
SampleNode* nodes = new SampleNode[seqCount];
int count = 0;
int offset = 0;
for (int i = 0; i < seqCount; i += step) {
SampleNode& node = nodes[count];
node.id = count;
node.offset = i;
node.p = buf + offset;
node.size = 0;
int max = 0;
for (int j = 0; j < step; j++) {
node.size += seqLen[i + j];
max = MAX(max, seqLen[i + j]);
}
node.value = max;
node.key = rand();
count++;
offset += node.size;
}
qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);
/* distribute samples into buckets. In each bucket, sequences have
similar lengths */
if (bucketSize > 0) {
int low = 0;
int high = low + bucketSize;
int n = count - 1;
int m = n;
int num = 0;
while (num < count) {
for (m = n; m >= 0; m--) {
if (nodes[m].value > high)
break;
}
qsort(nodes + m + 1, n - m, sizeof(SampleNode), CompareSampleNodeV2);
num += (n - m);
n = m;
low += bucketSize;
high = low + bucketSize;
}
}
count = 0;
offset = 0;
for (int i = 0; i < seqCount; i += step) {
SampleNode& node = nodes[count];
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for (int j = 0; j < step; j++) {
seqLen2[i + j] = seqLen[node.offset + j];
seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0);
}
count += 1;
offset += node.size;
}
int* tmp = buf;
buf = buf2;
buf2 = tmp;
tmp = seqLen;
seqLen = seqLen2;
seqLen2 = tmp;
delete[] nodes;
}
return lineCount;
}
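The expected file format, as implied by the parsing above: one sample per line, token ids separated by spaces or tabs; for MT data (step == 2) the source and target sides of a pair are delimited by a standalone "|||", e.g.
    12 7 9941 3 ||| 15 8 27 3
so each line contributes `step` sequences to seqLen/seqOffset.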
/* clear the data buffer */
void T2TBatchLoader::ClearBuf()
{
nseqBuf = 0;
nextSeq = -1;
}
/*
set the random batch flag
>> flag - as it is
*/
void T2TBatchLoader::SetRandomBatch(bool flag)
{
isRandomBatch = flag;
}
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining)
{
if (isLM) {
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, isTraining);
}
else {
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, ws, wCount, devID, isTraining);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSize - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vSize, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining)
{
if (nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
int wn = 0;
int sc = 0;
int max = 0;
while (seq + sc < nseqBuf) {
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
CheckNTErrors(len > 0, "Empty sequence!");
wn = len;
wc += wn;
sc += 1;
if (max < wn)
max = wn;
int tc = isBigBatch ? wc : max * sc;
if (sc >= sBatch && tc >= wBatch)
break;
}
wCount = 0;
nextSeq = seq + sc;
if (sc <= 0)
return 0;
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vSize;
InitTensor2D(batchEnc, sc, max, X_INT, devID);
InitTensor2D(label, sc, max, X_INT, devID);
InitTensor(gold, 3, dims, X_FLOAT, devID);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID);
batchEnc->SetZeroAll();
label->SetZeroAll();
gold->SetZeroAll();
paddingEnc->SetZeroAll();
paddingDec->SetZeroAll();
int seqSize = 0;
int* batchEncValues = new int[batchEnc->unitNum];
int* labelValues = new int[label->unitNum];
MTYPE* goldOffsets = new MTYPE[gold->unitNum];
MTYPE* paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE* paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0;
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum);
for (int s = seq; s < seq + sc; s++) {
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w];
batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w];
}
if (w == len - 1) {
if (isDoubledEnd) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w];
}
else {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1];
}
}
wCount++;
if (seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if (seqs != NULL) {
for (int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
label->SetData(labelValues, label->unitNum);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
/*XTensor * tmp = NewTensorBuf(paddingEnc, devID);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);*/
delete[] batchEncValues;
delete[] labelValues;
delete[] goldOffsets;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
return sc;
}
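In short, labelValues holds the input shifted left by one position (next-word prediction): the label at position w - 1 is the word at position w, and at the last position it is either the doubled </s> (when isDoubledEnd is set) or the following word in the buffer, i.e. the </s> that len excluded.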
int CompareBatchNode(const void* a, const void* b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSizeEnc - size of the encoder vocabulary
>> vSizeDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vSizeEnc, int vSizeDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining)
{
if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2);
int seq = 0;
bufBatchSize = 0;
nextBatch = 0;
/* we segment the buffer into batches */
while (seq < nseqBuf) {
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
while (seq + sc < nseqBuf) {
/* source-side sequence */
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;
if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
break;
wcEnc += wnEnc;
sc += 1;
if (maxEnc < wnEnc)
maxEnc = wnEnc;
wcDec += wnDec;
sc += 1;
if (maxDec < wnDec)
maxDec = wnDec;
}
BatchNode& batch = bufBatch[bufBatchSize];
batch.beg = seq;
batch.end = seq + sc;
batch.maxEnc = maxEnc;
batch.maxDec = maxDec;
batch.key = rand();
bufBatchSize++;
seq = seq + sc;
}
if (isRandomBatch)
qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
}
if (bufBatchSize <= 0)
return 0;
BatchNode& batch = bufBatch[nextBatch++];
int seq = batch.beg;
int sc = batch.end - batch.beg;
int maxEnc = batch.maxEnc;
int maxDec = batch.maxDec;
CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
int sCount = sc / 2;
int seqSize = 0;
InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
InitTensor2D(batchDec, sCount, maxDec, X_INT, devID);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID);
InitTensor2D(label, sCount, maxDec, X_INT, devID);
//InitTensor(gold, 3, dimsDec, X_FLOAT, devID);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
label->SetZeroAll();
//gold->SetZeroAll();
int wCountEnc = 0;
int wCountDec = 0;
int wCountPad = 0;
wCount = 0;
int* batchEncValues = new int[batchEnc->unitNum];
int* batchDecValues = new int[batchDec->unitNum];
int* labelValues = new int[label->unitNum];
MTYPE* paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE* paddingDecOffsets = new MTYPE[sc * maxDec / 2];
//MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum);
/* batch of the source-side sequences */
for (int s = seq; s < seq + sc; s += 2) {
int len = seqLen[s];
int sent = (s - seq) / 2;
for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
wCountEnc++;
}
}
ws = wCountEnc;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
//XTensor * tmp = NewTensorBuf(paddingEnc, devID);
//_ConvertDataType(batchEnc, tmp);
//tmp->Dump(stderr, "tmp:");
//_NotEqual(tmp, paddingEnc, 0);
//DelTensorBuf(tmp);
/* batch of the target-side sequences */
for (int s = seq + 1; s < seq + sc; s += 2) {
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1) / 2;
for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w < len - 1) {
paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
wCount++;
}
if (w > 0) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w];
}
if (w == len - 1) {
if (isDoubledEnd) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w];
}
else {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1];
}
}
//wCount++;
wCountDec++;
if (seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if (seqs != NULL) {
for (int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
batchDec->SetData(batchDecValues, batchDec->unitNum);
label->SetData(labelValues, label->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
//DelTensorBuf(tmp2);
//gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncValues;
delete[] batchDecValues;
delete[] labelValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
//delete[] goldOffsets;
return sc;
}
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void T2TBatchLoader::Shuffle(const char* srcFile, const char* tgtFile)
{
char* line = new char[MAX_LINE_LENGTH];
#ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line);
#else
ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
delete[] line;
}
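/* A portable alternative (an illustrative sketch, not part of the original
   code): the same shuffling can be done in memory with the standard library,
   which also works on Windows. It assumes <algorithm>, <fstream>, <random>,
   <string> and <vector> are included at the top of the file. */
static void ShuffleInMemory(const char* srcFile, const char* tgtFile)
{
    /* read all lines of the source file */
    std::ifstream in(srcFile);
    std::vector<std::string> lines;
    std::string line;
    while (std::getline(in, line))
        lines.push_back(line);

    /* shuffle them with a randomly seeded Mersenne Twister */
    std::mt19937 rng(std::random_device{}());
    std::shuffle(lines.begin(), lines.end(), rng);

    /* write the result */
    std::ofstream out(tgtFile);
    for (const std::string& l : lines)
        out << l << '\n';
}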
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH (1024 * 4)
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int* buf;
/* another buffer */
int* buf2;
/* batch buf */
BatchNode* bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int* seqLen;
/* another array */
int* seqLen2;
/* offset of the first word for each sequence */
int* seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(T2TConfig& config);
/* load data to buffer */
int LoadBuf(FILE* file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vs, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* shuffle the data file */
void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-08-09
* TODO: refactor the data loader class and references
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "TrainDataSet.h"
#include "../Utility.h"
#include "../translate/Vocab.h"
using namespace nmt;
namespace nts {
/* sort the dataset by length (in descending order) */
void TrainDataSet::SortByLength() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return (a->srcSent.Size() + a->tgtSent.Size())
> (b->srcSent.Size() + b->tgtSent.Size());
});
}
/* sort buckets by key (in descending order) */
void TrainDataSet::SortBucket() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return a->bucketKey > b->bucketKey;
});
}
/*
sort the output by key in a range (in descending order)
>> begin - the first index of the range
>> end - the last index of the range
*/
void TrainDataSet::SortInBucket(int begin, int end) {
sort(buffer.items + begin, buffer.items + end,
[](TrainExample* a, TrainExample* b) {
return (a->key > b->key);
});
}
/*
load all data from a file to the buffer
training data format (binary):
first 4 bytes: source vocabulary size
next 4 bytes: target vocabulary size
next 8 bytes: number of sentence pairs
subsequent segments:
source sentence length (4 bytes)
target sentence length (4 bytes)
source tokens (4 bytes per token)
target tokens (4 bytes per token)
*/
void TrainDataSet::LoadDataToBuffer()
{
buffer.Clear();
curIdx = 0;
int id = 0;
uint64_t sentNum = 0;
int srcVocabSize = 0;
int tgtVocabSize = 0;
fread(&srcVocabSize, sizeof(srcVocabSize), 1, fp);
fread(&tgtVocabSize, sizeof(tgtVocabSize), 1, fp);
fread(&sentNum, sizeof(uint64_t), 1, fp);
CheckNTErrors(sentNum > 0, "Invalid number of sentence pairs");
while (id < sentNum) {
int srcLen = 0;
int tgtLen = 0;
fread(&srcLen, sizeof(int), 1, fp);
fread(&tgtLen, sizeof(int), 1, fp);
CheckNTErrors(srcLen > 0, "Invalid source sentence length");
CheckNTErrors(tgtLen > 0, "Invalid target sentence length");
IntList srcSent;
IntList tgtSent;
srcSent.ReadFromFile(fp, srcLen);
tgtSent.ReadFromFile(fp, tgtLen);
TrainExample* example = new TrainExample;
example->id = id++;
example->key = id;
example->srcSent = srcSent;
example->tgtSent = tgtSent;
buffer.Add(example);
}
fclose(fp);
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
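/* For reference, a minimal sketch (not part of the original code) of a
   writer that produces the binary layout parsed above; the function name,
   token ids and lengths are hypothetical, and <cstdio>/<cstdint> are assumed
   to be available. */
static void WriteToyTrainingFile(const char* path)
{
    FILE* out = fopen(path, "wb");
    int srcVocabSize = 100;
    int tgtVocabSize = 100;
    uint64_t sentNum = 1;
    fwrite(&srcVocabSize, sizeof(int), 1, out);
    fwrite(&tgtVocabSize, sizeof(int), 1, out);
    fwrite(&sentNum, sizeof(uint64_t), 1, out);

    /* one sentence pair: both lengths first, then the token ids */
    int src[] = { 5, 8, 2 };
    int tgt[] = { 6, 9, 7, 2 };
    int srcLen = 3;
    int tgtLen = 4;
    fwrite(&srcLen, sizeof(int), 1, out);
    fwrite(&tgtLen, sizeof(int), 1, out);
    fwrite(src, sizeof(int), srcLen, out);
    fwrite(tgt, sizeof(int), tgtLen, out);
    fclose(out);
}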
/*
load a mini-batch to the device (for training)
>> batchEnc - a tensor to store the batch of encoder input
>> paddingEnc - a tensor to store the batch of encoder paddings
>> batchDec - a tensor to store the batch of decoder input
>> paddingDec - a tensor to store the batch of decoder paddings
>> label - a tensor to store the label of input
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - the number of target tokens and the number of sentences in the batch
*/
UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID)
{
UInt64List info;
size_t srcTokenNum = 0;
size_t tgtTokenNum = 0;
int realBatchSize = 1;
if (!isTraining)
realBatchSize = minSentBatch;
/* get the maximum source sentence length in a mini-batch */
size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
/* max batch size */
const int MAX_BATCH_SIZE = 512;
/* dynamic batching for sentences, enabled when the dataset is used for training */
if (isTraining) {
while ((realBatchSize < (buffer.Size() - curIdx))
&& (realBatchSize * maxSrcLen < batchSize)
&& (realBatchSize < MAX_BATCH_SIZE)
&& (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
realBatchSize++;
}
}
/* real batch size */
if ((buffer.Size() - curIdx) < realBatchSize) {
realBatchSize = buffer.Size() - curIdx;
}
CheckNTErrors(realBatchSize > 0, "Invalid batch size");
/* get the maximum target sentence length in a mini-batch */
size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
for (size_t i = 0; i < realBatchSize; i++) {
if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
}
for (size_t i = 0; i < realBatchSize; i++) {
if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
maxSrcLen = buffer[curIdx + i]->srcSent.Size();
}
CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
int* batchEncValues = new int[realBatchSize * maxSrcLen];
float* paddingEncValues = new float[realBatchSize * maxSrcLen];
int* labelValues = new int[realBatchSize * maxTgtLen];
int* batchDecValues = new int[realBatchSize * maxTgtLen];
float* paddingDecValues = new float[realBatchSize * maxTgtLen];
for (int i = 0; i < realBatchSize * maxSrcLen; i++) {
batchEncValues[i] = PAD;
paddingEncValues[i] = 1;
}
for (int i = 0; i < realBatchSize * maxTgtLen; i++) {
batchDecValues[i] = PAD;
labelValues[i] = PAD;
paddingDecValues[i] = 1.0F;
}
size_t curSrc = 0;
size_t curTgt = 0;
/*
batchEnc: end with EOS (right padding)
batchDec: begin with SOS (right padding)
label: end with EOS (right padding)
*/
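/* A worked example (the token ids 17, 42, 23, 56 are made up): with
   maxSrcLen = 4 and maxTgtLen = 4, a stored pair src = [17 42 EOS],
   tgt = [SOS 23 56] is laid out as
       batchEnc = [17 42 EOS PAD]    paddingEnc = [1 1 1 0]
       batchDec = [SOS 23 56 PAD]    paddingDec = [1 1 1 0]
       label    = [23 56 EOS PAD]
   i.e. the label row is the decoder input shifted left by one position and
   closed with EOS. */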
for (int i = 0; i < realBatchSize; ++i) {
srcTokenNum += buffer[curIdx + i]->srcSent.Size();
tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
curSrc = maxSrcLen * i;
for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
}
curTgt = maxTgtLen * i;
for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
if (j > 0)
labelValues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
}
labelValues[curTgt - 1] = EOS;
while (curSrc < maxSrcLen * (i + 1))
paddingEncValues[curSrc++] = 0;
while (curTgt < maxTgtLen * (i + 1))
paddingDecValues[curTgt++] = 0;
}
InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
curIdx += realBatchSize;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetData(paddingEncValues, paddingEnc->unitNum);
batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetData(paddingDecValues, paddingDec->unitNum);
label->SetData(labelValues, label->unitNum);
delete[] batchEncValues;
delete[] paddingEncValues;
delete[] batchDecValues;
delete[] paddingDecValues;
delete[] labelValues;
info.Add(tgtTokenNum);
info.Add(realBatchSize);
return info;
}
/*
the constructor of DataSet
>> dataFile - path of the data file
>> bucketSize - size of the bucket to keep similar length sentence pairs
>> training - indicates whether it is used for training
*/
void TrainDataSet::Init(const char* dataFile, int myBucketSize, bool training)
{
fp = fopen(dataFile, "rb");
CheckNTErrors(fp, "cannot open the training file");
curIdx = 0;
bucketSize = myBucketSize;
isTraining = training;
LoadDataToBuffer();
SortByLength();
if (isTraining)
BuildBucket();
}
/* check if the buffer is empty */
bool TrainDataSet::IsEmpty() {
if (curIdx < buffer.Size())
return false;
return true;
}
/* reset the buffer */
void TrainDataSet::ClearBuf()
{
curIdx = 0;
/* make different batches in different epochs */
SortByLength();
if (isTraining)
BuildBucket();
}
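/* Typical usage (an illustrative sketch, not part of the original code; the
   file name and sizes are hypothetical):

       TrainDataSet dataSet;
       dataSet.Init("train.bin", 4096, true);
       XTensor batchEnc, paddingEnc, batchDec, paddingDec, label;
       while (!dataSet.IsEmpty()) {
           UInt64List info = dataSet.LoadBatch(&batchEnc, &paddingEnc,
                                               &batchDec, &paddingDec, &label,
                                               1, 4096, -1);
           // info[0] = number of target tokens, info[1] = number of sentences
       }
       dataSet.ClearBuf();  // re-sort and re-bucket for the next epoch
*/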
/* group data into buckets with similar length */
void TrainDataSet::BuildBucket()
{
size_t idx = 0;
/* build and shuffle buckets */
while (idx < buffer.Size()) {
/* sentence number in a bucket */
size_t sentNum = 1;
/* get the maximum source sentence length in a bucket */
size_t maxSrcLen = buffer[idx]->srcSent.Size();
/* bucketing for sentences */
while ((sentNum < (buffer.Size() - idx))
&& (sentNum * maxSrcLen < bucketSize)
&& (sentNum * buffer[idx + sentNum]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
/* make sure the number is valid */
if ((buffer.Size() - idx) < sentNum) {
sentNum = buffer.Size() - idx;
}
int randomKey = rand();
/* shuffle items in a bucket */
for (size_t i = 0; i < sentNum; i++) {
buffer[idx + i]->bucketKey = randomKey;
}
idx += sentNum;
}
SortBucket();
/* sort items in a bucket */
idx = 0;
while (idx < buffer.Size()) {
size_t sentNum = 0;
int bucketKey = buffer[idx + sentNum]->bucketKey;
while (sentNum < (buffer.Size() - idx)
&& buffer[idx + sentNum]->bucketKey == bucketKey) {
buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
SortInBucket(idx, idx + sentNum);
idx += sentNum;
}
}
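/* A worked example (illustrative lengths): after SortByLength(), source
   lengths [9 7 6 5 3 2] with bucketSize = 8 are grouped so that each bucket
   holds roughly bucketSize source tokens: {9}, {7 6}, {5 3}, {2}. Every
   bucket then gets one random key, so SortBucket() shuffles the buckets as
   units, and SortInBucket() re-orders the sentences inside each bucket by
   length. */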
/* de-constructor */
TrainDataSet::~TrainDataSet()
{
/* release the buffer */
for (int i = 0; i < buffer.Size(); i++)
delete buffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a class of sentence pairs for training */
struct TrainExample {
/* id of the sentence pair */
int id;
/* source language sentence (tokenized) */
IntList srcSent;
/* target language sentence (tokenized) */
IntList tgtSent;
/* the key used to shuffle items in a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
public:
/* the data buffer */
TrainBufferType buffer;
/* a list of empty line numbers */
IntList emptyLines;
/* the pointer to file stream */
FILE* fp;
/* current index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences */
size_t bucketSize;
/* indicates whether it is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort the output by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is empty */
bool IsEmpty();
/* reset the buffer */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,30 +18,31 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */
-#include <cmath>
-#include "T2TTrainer.h"
-#include "../module/T2TUtility.h"
+#include "Trainer.h"
+#include "../Utility.h"
+#include "../../../network/XNoder.h"
 #include "../../../tensor/XUtility.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/loss/LHeader.h"
-#include "../../../network/XNoder.h"
 #ifndef WIN32
 #include <sys/time.h>
 #include <unistd.h>
 #endif
-#include "../../../tensor/XMem.h"
-namespace transformer
+namespace nmt
 {
 /* constructor */
-T2TTrainer::T2TTrainer()
+Trainer::Trainer()
 {
 cfg = NULL;
 }
 /* de-constructor */
-T2TTrainer::~T2TTrainer()
+Trainer::~Trainer()
 {
 for (int i = 0; i < moments.count; i++) {
 XTensor* m = (XTensor*)moments.Get(i);
@@ -59,15 +59,17 @@ T2TTrainer::~T2TTrainer()
 initialization
 >> config - configurations of the training process
 */
-void T2TTrainer::Init(T2TConfig& config)
+void Trainer::Init(Config& config)
 {
 cfg = &config;
 lrate = config.lrate;
 lrbias = config.lrbias;
 sBatchSize = config.sBatchSize;
 wBatchSize = config.wBatchSize;
+bucketSize = config.bucketSize;
 nepoch = config.nepoch;
 nstep = config.nstep;
+maxCheckpoint = config.maxCheckpoint;
 d = config.modelSize;
 nwarmup = config.nwarmup;
 vSize = config.srcVocabSize;
@@ -81,17 +83,12 @@ void T2TTrainer::Init(T2TConfig& config)
 nStepCheckpoint = config.nStepCheckpoint;
 useEpochCheckpoint = config.useEpochCheckpoint;
 updateStep = config.updateStep;
-isDebugged = config.isDebugged;
 isLenSorted = config.isLenSorted;
 adamBeta1T = 1.0F;
 adamBeta2T = 1.0F;
-batchLoader.Init(config);
 }
-int tc = 0;
 /*
 train the model
 >> fn - training data file
@@ -99,8 +96,14 @@ train the model
 >> modelFN - where we keep the model
 >> model - model to train
 */
-void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model)
+void Trainer::Train(const char* fn, const char* validFN,
+const char* modelFN, Model* model)
 {
+/* disable cache during training */
+for (int i = 0; i < model->decoder->nlayer; i++) {
+model->decoder->selfAttCache[i].enable = false;
+model->decoder->enDeAttCache[i].enable = false;
+}
 int step = 0;
 int wc = 0;
 int ws = 0;
@@ -126,26 +129,26 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 #endif
 int devID = model->devID;
-XNet net;
 PrepareModel(model);
 double startT = GetClockSec();
-for (epoch = 1; epoch <= nepoch; epoch++) {
-#ifndef WIN32
-if (isShuffled) {
-fprintf(stderr, "shuffle the file\n");
-batchLoader.Shuffle(fn, trainFN);
-}
-#endif
-FILE* file = fopen(trainFN, "r");
-CheckNTErrors(file, "cannot open training file!");
+batchLoader.Init(fn, bucketSize, true);
+for (epoch = 1; epoch <= nepoch; epoch++) {
 wordCount = 0;
 loss = 0;
+/* reset the batch loader */
+batchLoader.ClearBuf();
+while (!batchLoader.IsEmpty())
+{
+XNet net;
+net.Clear();
 /* batch of sequences (on the encoder and decoder sides) */
 XTensor batchEnc;
 XTensor batchDec;
@@ -157,14 +160,11 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 XTensor paddingEnc;
 XTensor paddingDec;
-/* gold standard */
-XTensor gold;
-while (batchLoader.LoadBatch(file, model->isLM,
-&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
-NULL, vSize, vSizeTgt,
-sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
-{
+UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
+sBatchSize, wBatchSize, devID);
+wc = info[0];
+ws = info[1];
 CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
 /* output probabilities */
@@ -204,10 +204,18 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 /* update the parameters */
 if (gradStep == updateStep) {
-/* learning rate */
-lr = lrate * (1.0F / (float)sqrt((float)d)) *
-(float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
-((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
+float warmupEndLR = lrate;
+float warmupInitLR = 1e-7;
+float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
+float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
+/* learning rate, scheduled by inverse square root */
+if (step < nwarmup)
+lr = warmupInitLR + step * lrStep;
+else
+lr = decayFactor * pow((float)step, -0.5F);
 /* model update */
 Update(model, lr);
@@ -224,15 +232,21 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 break;
 }
+if (step == 10) {
+// LOG("after backward --------");
+// lossTensor.mem->ShowMemUsage(stderr);
+// exit(0);
+}
 if (step % 100 == 0) {
 double elapsed = GetClockSec() - startT;
-XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-elapsed, step, epoch,
-wordCountTotal, batchCountTotal,
-loss / wordCount, exp(loss / wordCount), exp(lossBatch / wc));
+LOG("elapsed=%.1fs, step=%d, epoch=%d, "
+"total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, lr=%.2e",
+elapsed, step, epoch, wordCountTotal, batchCountTotal,
+loss / wordCount / log(2.0), exp(loss / wordCount), lr);
 if (!doUpdate)
 XPRINT(0, stderr, " (no update)");
-XPRINT(0, stderr, "\n");
 }
 if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
@@ -242,8 +256,6 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 }
 }
-fclose(file);
 if (isEnd)
 break;
@@ -255,10 +267,14 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 epoch = MIN(epoch, nepoch);
-XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
-lr, elapsed, step, epoch, wordCountTotal, loss / wordCount, exp(loss / wordCount));
-XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
-elapsed, step, nSkipped, epoch);
+LOG("lr=%.2e, elapsed=%.1fs, step=%d, "
+"epoch=%d, word=%d, loss=%.3f, ppl=%.3f",
+lr, elapsed, step, epoch, wordCountTotal, loss / wordCount / log(2.0), exp(loss / wordCount));
+LOG("training finished (took %.1fs, step=%d, "
+"skipped=%d and epoch=%d)", elapsed, step, nSkipped, epoch);
+LOG("saving the final model");
+model->Dump(modelFN);
 delete[] trainFN;
 }
@@ -269,7 +285,7 @@ test the model
 >> ofn - output data file
 >> model - model that is trained
 */
-void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
+void Trainer::Validate(const char* fn, const char* ofn, Model* model)
 {
 int wc = 0;
 int ws = 0;
@@ -278,13 +294,12 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
 float loss = 0;
 /* data files */
-FILE* file = fopen(fn, "rb");
-CheckNTErrors(file, "Cannot read the test file");
-FILE* ofile = fopen(ofn, "wb");
-CheckNTErrors(ofile, "Cannot open the output file");
+batchLoader.Init(fn, 0, false);
 double startT = GetClockSec();
+while (!batchLoader.IsEmpty())
+{
 /* batch of input sequences */
 XTensor batchEnc;
 XTensor batchDec;
@@ -296,24 +311,19 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
 XTensor paddingEnc;
 XTensor paddingDec;
-/* gold standard */
-XTensor gold;
-/* an array that keeps the sequences */
-int* seqs = new int[MILLION];
-batchLoader.ClearBuf();
-while (batchLoader.LoadBatch(file, model->isLM,
-&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
-seqs, vSize, vSizeTgt,
-1, 1, false, ws, wc, model->devID, false))
-{
-CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
 /* output probabilities */
 XTensor output;
+/* prediction probabilities */
+XTensor labelOnehot;
+XTensor lossTensor;
+UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
+sBatchSize, 0, model->devID);
+wc = info[0];
+ws = info[1];
+CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
 /* make the network */
 if (model->isLM)
 model->MakeLM(batchEnc, output, paddingEnc, false);
@@ -326,52 +336,20 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
 int bSize = output.GetDim(0);
 int length = output.GetDim(1);
-/* prediction probabilities */
-XTensor labelOnehot;
-XTensor lossTensor;
 labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
 lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
 float lossBatch = ReduceSumAllValue(lossTensor);
-/* dump the test result */
-for (int s = 0; s < bSize; s++) {
-DTYPE sum = 0;
-int* seq = seqs + s * length;
-for (int i = 0; i < length; i++) {
-if (seq[i] >= 0) {
-fprintf(ofile, "%d ", seq[i]);
-}
-else
-break;
-}
-fprintf(ofile, "||| ");
-for (int i = 0; i < length; i++) {
-if (seq[i] >= 0) {
-DTYPE p = lossTensor.Get2D(s, i);
-fprintf(ofile, "%.3e ", p);
-sum += p;
-}
-else
-break;
-}
-fprintf(ofile, "||| %e\n", sum);
-}
 loss += lossBatch;
 wordCount += wc;
 sentCount += bSize;
 }
-fclose(file);
-fclose(ofile);
-delete[] seqs;
 double elapsed = GetClockSec() - startT;
-XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
-elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
+LOG("test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)",
+elapsed, sentCount, wordCount, loss / wordCount / log(2.0), exp(loss / wordCount));
 }
 /*
@@ -382,20 +360,29 @@ make a checkpoint
 >> label - label of the model
 >> id - id of the checkpoint
 */
-void T2TTrainer::MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id)
+void Trainer::MakeCheckpoint(Model* model, const char* validFN,
+const char* modelFN, const char* label, int id)
 {
-fprintf(stderr, "make a checkpoint\n");
+LOG("make a checkpoint");
 char* fn = new char[MAX_LINE_LENGTH];
+Trainer validator;
+validator.Init(*cfg);
+/* save last checkpoints */
+id = validator.maxCheckpoint - (maxCheckpoint--);
+if (maxCheckpoint == 0)
+maxCheckpoint = validator.maxCheckpoint;
 sprintf(fn, "%s.%s.%03d", modelFN, label, id);
 model->Dump(fn);
 delete[] fn;
 char* fn2 = new char[MAX_LINE_LENGTH];
 sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
 if (validFN != NULL) {
-T2TTrainer trainer;
-trainer.Init(*cfg);
-trainer.Validate(validFN, fn2, model);
+validator.Validate(validFN, fn2, model);
 }
 delete[] fn2;
 }
@@ -405,12 +392,12 @@ update the model by delta rule
 \theta_{new} = \theta - \lrate * grad
 where
 \lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
->> model - the t2t model
+>> model - the model
 >> lr - learning rate
 */
-void T2TTrainer::Update(T2TModel* model, const float lr)
+void Trainer::Update(Model* model, const float lr)
 {
-TensorList ws(100);
+TensorList ws;
 model->GetParams(ws);
@@ -465,12 +452,12 @@ void T2TTrainer::Update(T2TModel* model, const float lr)
 prepare model for training
 >> model - the model for training
 */
-void T2TTrainer::PrepareModel(T2TModel* model)
+void Trainer::PrepareModel(Model* model)
 {
 moments.Clear();
 moments2nd.Clear();
-TensorList ws(100);
+TensorList ws;
 model->GetParams(ws);
......
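The learning rate schedule introduced in Trainer::Train above warms up linearly from warmupInitLR to lrate over nwarmup steps and then decays with the inverse square root of the step number. A standalone sketch of the same arithmetic (the peak rate and warmup length are illustrative assumptions, not values from this commit):

#include <cmath>
#include <cstdio>

int main()
{
    const float lrate = 0.002F;       /* assumed peak learning rate */
    const int nwarmup = 4000;         /* assumed warmup steps */
    const float warmupInitLR = 1e-7F;
    const float lrStep = (lrate - warmupInitLR) / nwarmup;
    const float decayFactor = lrate * std::pow((float)nwarmup, 0.5F);
    const int steps[] = { 1000, 4000, 16000, 64000 };
    for (int step : steps) {
        float lr = (step < nwarmup) ? warmupInitLR + step * lrStep
                                    : decayFactor * std::pow((float)step, -0.5F);
        /* prints roughly 0.0005, 0.002, 0.001 and 0.0005 respectively */
        std::printf("step=%6d  lr=%.6f\n", step, lr);
    }
    return 0;
}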
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,25 +18,24 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */
-#ifndef __T2TTRAINER_H__
-#define __T2TTRAINER_H__
+#ifndef __TRAINER_H__
+#define __TRAINER_H__
-#include "../T2TModel.h"
-#include "T2TBatchLoader.h"
-#include "../../../tensor/function/FHeader.h"
+#include "../Model.h"
+#include "TrainDataSet.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
-/* trainer of the T2T model */
-class T2TTrainer
+/* trainer of the model */
+class Trainer
 {
 public:
 /* configurations */
-T2TConfig* cfg;
+Config* cfg;
 /* dimension size of each inner layer */
 int d;
@@ -63,12 +61,18 @@ public:
 /* word batch size */
 int wBatchSize;
+/* size of bucket for grouping data by length */
+int bucketSize;
 /* training epoch number */
 int nepoch;
 /* training step number */
 int nstep;
+/* the maximum number of saved checkpoints */
+int maxCheckpoint;
 /* indicates whether we use adam */
 bool useAdam;
@@ -100,39 +104,36 @@ public:
 /* number of batches on which we do model update */
 int updateStep;
-/* indicates whether we intend to debug the net */
-bool isDebugged;
 /* indicates whether the sequence is sorted by length */
 bool isLenSorted;
-/* for batching */
-T2TBatchLoader batchLoader;
+/* used for loading batches */
+TrainDataSet batchLoader;
 public:
 /* constructor */
-T2TTrainer();
+Trainer();
 /* de-constructor */
-~T2TTrainer();
+~Trainer();
 /* initialize the trainer */
-void Init(T2TConfig& config);
+void Init(Config& config);
 /* train the model */
-void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
+void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
 /* test the model */
-void Validate(const char* fn, const char* ofn, T2TModel* model);
+void Validate(const char* fn, const char* ofn, Model* model);
 /* make a checkpoint */
-void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
+void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
 /* update the model by delta rule */
-void Update(T2TModel* model, const float lr);
+void Update(Model* model, const float lr);
 /* prepare model for training */
-void PrepareModel(T2TModel* model);
+void PrepareModel(Model* model);
 };
 }
......
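The maxCheckpoint counter added above is used in Trainer::MakeCheckpoint to rotate checkpoint ids so that only the newest maxCheckpoint model files are kept. A standalone sketch of the id arithmetic (maxCheckpoint = 3 and the number of calls are assumed values):

#include <cstdio>

int main()
{
    const int limit = 3;   /* plays the role of validator.maxCheckpoint */
    int remaining = limit; /* plays the role of this->maxCheckpoint */
    for (int call = 0; call < 7; call++) {
        int id = limit - (remaining--);
        if (remaining == 0)
            remaining = limit;
        /* the ids cycle 0, 1, 2, 0, 1, 2, 0, so old files are overwritten */
        std::printf("call %d -> checkpoint id %d\n", call, id);
    }
    return 0;
}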
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,23 +25,25 @@
 #include <fstream>
 #include <algorithm>
-#include "T2TDataSet.h"
-#include "../module/T2TUtility.h"
+#include "DataSet.h"
+#include "../Utility.h"
-using namespace transformer;
+using namespace nmt;
 namespace nts {
 /* sort the output by id (in ascending order) */
 void DataSet::SortInput() {
-sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
+sort(inputBuffer.items, inputBuffer.items + inputBuffer.count,
+[](Example* a, Example* b) {
 return a->values.count > b->values.count;
 });
 }
 /* sort the input by length (in descending order) */
 void DataSet::SortOutput() {
-sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
+sort(outputBuffer.items, outputBuffer.items + outputBuffer.count,
+[](Result* a, Result* b) {
 return a->id < b->id;
 });
 }
@@ -74,7 +75,7 @@ void DataSet::LoadDataToBuffer()
 : line.size() - indices[i];
 string word = line.substr(indices[i], offset);
 if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
-values.Add(3);
+values.Add(UNK);
 else
 values.Add(srcVocab.word2id.at(word));
 }
@@ -100,7 +101,7 @@ void DataSet::LoadDataToBuffer()
 }
 /*
-load a mini-batch to the device
+load a mini-batch to the device (for translating)
 >> batchEnc - a tensor to store the batch of input
 >> paddingEnc - a tensor to store the batch of paddings
 >> minSentBatch - the minimum number of sentence batch
@@ -117,10 +118,10 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
 size_t maxLen = inputBuffer[bufferUsed]->values.Size();
 /* dynamic batching for sentences */
-while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
-&& (realBatchSize * maxLen < batchSize)) {
-realBatchSize++;
-}
+//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
+//    && (realBatchSize * maxLen < batchSize)) {
+//    realBatchSize++;
+//}
 /* real batch size */
 if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
@@ -133,13 +134,13 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
 float* paddingValues = new float[realBatchSize * maxLen];
 for (int i = 0; i < realBatchSize * maxLen; i++) {
-batchValues[i] = 1;
-paddingValues[i] = 0.0F;
+batchValues[i] = PAD;
+paddingValues[i] = 1.0F;
 }
-size_t cur = 0;
+size_t curSrc = 0;
-/* left padding */
+/* right padding */
 UInt64List infos;
 size_t totalLength = 0;
@@ -147,11 +148,11 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
 infos.Add(inputBuffer[bufferUsed + i]->id);
 totalLength += inputBuffer[bufferUsed + i]->values.Size();
-cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
-for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
-batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
-paddingValues[cur++] = 1.0F;
-}
+curSrc = maxLen * i;
+for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
+batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
+while (curSrc < maxLen * (i + 1))
+paddingValues[curSrc++] = 0;
 }
 infos.Add(totalLength);
@@ -178,7 +179,7 @@ the constructor of DataSet
 void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
 {
 fp = new ifstream(dataFile);
-CheckNTErrors(fp->is_open(), "can not open the file");
+CheckNTErrors(fp->is_open(), "Can not open the test data");
 bufferUsed = 0;
 CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
......
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,7 +25,7 @@
 #include <cstdio>
 #include <vector>
 #include <fstream>
-#include "T2TVocab.h"
+#include "Vocab.h"
 #include "../../../tensor/XList.h"
 #include "../../../tensor/XTensor.h"
......
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,11 +21,11 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */
-#include "T2TLengthPenalty.h"
+#include "LengthPenalty.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
 /*
@@ -36,7 +35,7 @@ where n = length of the sequence
 >> alpha - the parameter controls the length preference
 << return - length penalty of the sequence
 */
-float T2TLengthPenalizer::GNMT(float length, float alpha)
+float LengthPenalizer::GNMT(float length, float alpha)
 {
 float base;
 float lp;
......
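A worked example of the GNMT length penalty lp = ((5 + n) / (5 + 1))^alpha computed above (a standalone sketch; the lengths and alpha are illustrative assumptions):

#include <cmath>
#include <cstdio>

int main()
{
    const float alpha = 0.6F;
    const float lengths[] = { 5.0F, 10.0F, 20.0F };
    for (float n : lengths) {
        /* lp grows with n: roughly 1.36, 1.73 and 2.35 here */
        float lp = std::pow((5.0F + n) / 6.0F, alpha);
        /* hypothesis scores are divided by lp, so longer outputs are
           penalized less and can compete with short ones */
        std::printf("n=%4.0f  lp=%.3f\n", n, lp);
    }
    return 0;
}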
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,21 +21,21 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */
-#ifndef __T2TLENGTHPENALTY_H__
-#define __T2TLENGTHPENALTY_H__
+#ifndef __LENGTHPENALTY_H__
+#define __LENGTHPENALTY_H__
-#include "../module/T2TUtility.h"
+#include "../Utility.h"
 #include "../../../tensor/XTensor.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
 /* We intend to penalize short sequences because they have higher score
 in product of a sequence of probability-like terms and have more chances
 to beat others in search. */
-class T2TLengthPenalizer
+class LengthPenalizer
 {
 public:
 /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
......
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,23 +21,23 @@
 #include <iostream>
-#include "T2TPredictor.h"
-#include "../module/T2TNNUtil.h"
+#include "Predictor.h"
+#include "../module/NNUtil.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
 /* constructor */
-T2TStateBundle::T2TStateBundle()
+StateBundle::StateBundle()
 {
 states = NULL;
 isStart = false;
 }
 /* de-constructor */
-T2TStateBundle::~T2TStateBundle()
+StateBundle::~StateBundle()
 {
 if (states != NULL)
 delete[] states;
@@ -48,18 +47,18 @@ T2TStateBundle::~T2TStateBundle()
 create states
 >> num - number of states
 */
-void T2TStateBundle::MakeStates(int num)
+void StateBundle::MakeStates(int num)
 {
 CheckNTErrors(num > 0, "invalid number");
 if (states != NULL)
 delete[] states;
-states = new T2TState[num];
+states = new State[num];
 for (int i = 0; i < num; i++) {
 states[i].prediction = -1;
-states[i].pid = T2T_PID_EMPTY;
+states[i].pid = _PID_EMPTY;
 states[i].isEnd = false;
 states[i].isStart = false;
 states[i].isCompleted = false;
@@ -74,26 +73,26 @@ void T2TStateBundle::MakeStates(int num)
 }
 /* constructor */
-T2TPredictor::T2TPredictor()
+Predictor::Predictor()
 {
 startSymbol = 2;
 }
 /* de-constructor */
-T2TPredictor::~T2TPredictor()
+Predictor::~Predictor()
 {
 }
 /*
 create an initial state
->> model - the t2t model
+>> model - the model
 >> top - the top-most layer of the network
 >> input - input of the network
 >> beamSize - beam size
 >> state - the state to be initialized
 */
-void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
-int beamSize, T2TStateBundle* state)
+void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
+int beamSize, StateBundle* state)
 {
 int dims[MAX_TENSOR_DIM_NUM];
 for (int i = 0; i < input->order - 1; i++)
@@ -114,20 +113,20 @@ void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
 set start symbol
 >> symbol - the symbol (in integer)
 */
-void T2TPredictor::SetStartSymbol(int symbol)
+void Predictor::SetStartSymbol(int symbol)
 {
 startSymbol = symbol;
 }
 /*
 read a state
->> model - the t2t model that keeps the network created so far
+>> model - the model that keeps the network created so far
 >> state - a set of states. It keeps
 1) hypotheses (states)
 2) probabilities of hypotheses
 3) parts of the network for expanding toward the next state
 */
-void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
+void Predictor::Read(Model* model, StateBundle* state)
 {
 m = model;
 s = state;
@@ -147,7 +146,7 @@ predict the next state
 >> needReorder - whether we need reordering the states
 >> nstep - current time step of the target sequence
 */
-void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
+void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
 XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
 XTensor& reorderState, bool needReorder, int nstep)
 {
@@ -221,14 +220,14 @@ void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& e
 generate paths up to the states of the current step
 >> state - state bundle of the current step
 */
-XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
+XTensor Predictor::GeneratePaths(StateBundle* state)
 {
 CheckNTErrors(state->stateNum >= 0, "Illegal state!");
 int distance = -1;
 for (int i = 0; i < state->stateNum; i++) {
-T2TState* cur = state->states + i;
+State* cur = state->states + i;
 int nsteps = 0;
 while (cur != NULL) {
@@ -245,7 +244,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
 path.SetZeroAll();
 for (int i = 0; i < state->stateNum; i++) {
-T2TState* cur = state->states + i;
+State* cur = state->states + i;
 int nsteps = 0;
 while (cur != NULL) {
@@ -263,21 +262,21 @@ get the predictions of the previous step
 >> state - state bundle of the current step
 >> devID - the device id for the predictions
 */
-XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
+XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
 {
 CheckNTErrors(state->stateNum >= 0, "Illegal state!");
 IntList last;
 for (int i = 0; i < state->stateNum; i++) {
-T2TState* cur = state->states + i;
+State* cur = state->states + i;
 last.Add(cur->prediction);
 }
 XTensor lastPred;
-InitTensor2D(&lastPred, last.Size(), 1, X_INT, devID);
-lastPred.SetData(last.items, last.Size());
+InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
+lastPred.SetData(last.items, int(last.Size()));
 return lastPred;
 }
......
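GeneratePaths and GetLastPrediction above both walk the `last` back-pointers of State objects to recover a hypothesis. A minimal standalone sketch of the same back-pointer idea (the struct and values are hypothetical):

#include <cstdio>

struct Node {
    int prediction; /* predicted token id */
    Node* last;     /* pointer to the previous state */
};

int main()
{
    /* a 3-step chain: 2 (start symbol) -> 7 -> 9 */
    Node a{ 2, nullptr };
    Node b{ 7, &a };
    Node c{ 9, &b };
    /* walk backwards from the newest state; prints "9 7 2" */
    for (Node* cur = &c; cur != nullptr; cur = cur->last)
        std::printf("%d ", cur->prediction);
    std::printf("\n");
    return 0;
}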
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -21,22 +20,22 @@ ...@@ -21,22 +20,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TPREDICTOR_H__ #ifndef __PREDICTOR_H__
#define __T2TPREDICTOR_H__ #define __PREDICTOR_H__
#include "../T2TModel.h" #include "../Model.h"
#include "T2TLengthPenalty.h" #include "LengthPenalty.h"
using namespace std; using namespace std;
namespace transformer namespace nmt
{ {
#define T2T_PID_EMPTY -1 #define _PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution, /* state for search. It keeps the path (back-pointer), prediction distribution,
and etc. It can be regarded as a hypotheses in translation. */ and etc. It can be regarded as a hypotheses in translation. */
class T2TState class State
{ {
public: public:
/* we assume that the prediction is an integer */ /* we assume that the prediction is an integer */
...@@ -69,11 +68,11 @@ public: ...@@ -69,11 +68,11 @@ public:
int nstep; int nstep;
/* pointer to the previous state */ /* pointer to the previous state */
T2TState* last; State* last;
}; };
/* a bundle of states */ /* a bundle of states */
class T2TStateBundle class StateBundle
{ {
public: public:
/* predictions */ /* predictions */
...@@ -98,7 +97,7 @@ public: ...@@ -98,7 +97,7 @@ public:
float nstep; float nstep;
/* list of states */ /* list of states */
T2TState* states; State* states;
/* number of states */ /* number of states */
int stateNum; int stateNum;
...@@ -108,10 +107,10 @@ public: ...@@ -108,10 +107,10 @@ public:
public: public:
/* constructor */ /* constructor */
T2TStateBundle(); StateBundle();
/* de-constructor */ /* de-constructor */
~T2TStateBundle(); ~StateBundle();
/* create states */ /* create states */
void MakeStates(int num); void MakeStates(int num);
...@@ -122,14 +121,14 @@ public: ...@@ -122,14 +121,14 @@ public:
we get the state of previous words and then generate the next word. we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings and etc.). */ indices, hidden states, embeddings and etc.). */
class T2TPredictor class Predictor
{ {
private: private:
/* pointer to the transformer model */ /* pointer to the transformer model */
T2TModel* m; Model* m;
/* current state */ /* current state */
T2TStateBundle* s; StateBundle* s;
/* start symbol */ /* start symbol */
int startSymbol; int startSymbol;
...@@ -139,30 +138,30 @@ private: ...@@ -139,30 +138,30 @@ private:
public: public:
/* constructor */ /* constructor */
T2TPredictor(); Predictor();
/* de-constructor */ /* de-constructor */
~T2TPredictor(); ~Predictor();
/* create an initial state */ /* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state); void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);
/* set the start symbol */ /* set the start symbol */
void SetStartSymbol(int symbol); void SetStartSymbol(int symbol);
/* read a state */ /* read a state */
void Read(T2TModel* model, T2TStateBundle* state); void Read(Model* model, StateBundle* state);
/* predict the next state */ /* predict the next state */
void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding, void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize, XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep); bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */ /* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state); XTensor GeneratePaths(StateBundle* state);
/* get the predictions of the previous step */ /* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state, int devID); XTensor GetLastPrediction(StateBundle* state, int devID);
}; };
} }
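The back-pointer chain kept in State is what GeneratePaths walks at decoding time; a hypothetical sketch (not the repo's code) of recovering one hypothesis from a final state:

#include <vector>
#include <algorithm>

/* a hypothetical sketch: follow the `last` chain from a final state and
   reverse the collected predictions to recover the token sequence;
   State::prediction is assumed to be the predicted token id, as the
   class comment above states */
std::vector<int> TracePath(const State* end)
{
    std::vector<int> tokens;
    for (const State* s = end; s != NULL; s = s->last)
        tokens.push_back(s->prediction);
    std::reverse(tokens.begin(), tokens.end());
    return tokens;
}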
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,13 +19,13 @@ ...@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#include "T2TSearch.h" #include "Search.h"
#include "../module/T2TUtility.h" #include "../Utility.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
BeamSearch::BeamSearch() BeamSearch::BeamSearch()
...@@ -55,7 +54,7 @@ initialize the model ...@@ -55,7 +54,7 @@ initialize the model
>> config - the configuration of the system >> config - the configuration of the system
*/ */
void BeamSearch::Init(T2TConfig& config) void BeamSearch::Init(Config& config)
{ {
beamSize = config.beamSize; beamSize = config.beamSize;
batchSize = config.sBatchSize; batchSize = config.sBatchSize;
...@@ -105,10 +104,10 @@ search for the most promising states ...@@ -105,10 +104,10 @@ search for the most promising states
>> output - output that represents the sequences as rows >> output - output that represents the sequences as rows
>> score - score of the sequences >> score - score of the sequences
*/ */
void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding, void BeamSearch::Search(Model* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score) IntList* output, XTensor& score)
{ {
T2TPredictor predictor; Predictor predictor;
XTensor maskEnc; XTensor maskEnc;
XTensor encoding; XTensor encoding;
XTensor encodingBeam; XTensor encodingBeam;
...@@ -140,10 +139,10 @@ void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding, ...@@ -140,10 +139,10 @@ void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
CheckNTErrors(lengthLimit > 0, "no max length specified!"); CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit; maxLength = lengthLimit;
T2TStateBundle* states = new T2TStateBundle[lengthLimit + 1]; StateBundle* states = new StateBundle[lengthLimit + 1];
T2TStateBundle* first = states; StateBundle* first = states;
T2TStateBundle* cur = NULL; StateBundle* cur = NULL;
T2TStateBundle* next = NULL; StateBundle* next = NULL;
/* create the first state */ /* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first); predictor.Create(model, &encodingBeam, &input, beamSize, first);
...@@ -213,7 +212,7 @@ compute the model score for each hypotheses ...@@ -213,7 +212,7 @@ compute the model score for each hypotheses
>> prev - the beam of the previous state >> prev - the beam of the previous state
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
*/ */
void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam) void BeamSearch::Score(StateBundle* prev, StateBundle* beam)
{ {
XTensor& score = beam->modelScore; XTensor& score = beam->modelScore;
XTensor& prob = beam->prob; XTensor& prob = beam->prob;
...@@ -244,7 +243,7 @@ void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam) ...@@ -244,7 +243,7 @@ void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
beam->nstep = prev->nstep + 1.0F; beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */ /* the GNMT-like length penalty */
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha); float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */ /* score = log-prob/lp */
score = probPath / lp; score = probPath / lp;
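For reference, the GNMT-style penalty divides the accumulated log-probability by a term that grows with length; a minimal sketch, assuming the ((5 + len) / 6)^alpha form from the GNMT paper (the repo's LengthPenalizer::GNMT may use the same constants):

#include <cmath>

/* a minimal sketch of the GNMT-style length penalty */
float GNMTLengthPenalty(float length, float alpha)
{
    return powf((5.0F + length) / 6.0F, alpha);
}

/* usage: score = logProb / GNMTLengthPenalty(nstep, alpha);
   alpha = 0 gives lp = 1, i.e., no length normalization at all */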
...@@ -279,7 +278,7 @@ generate tokens for the next state via beam pruning ...@@ -279,7 +278,7 @@ generate tokens for the next state via beam pruning
>> prev - the last beam >> prev - the last beam
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
*/ */
void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam) void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
{ {
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM]; int dimsBeam[MAX_TENSOR_DIM_NUM];
...@@ -323,7 +322,7 @@ void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam) ...@@ -323,7 +322,7 @@ void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
/* keep the most promising candidates in the beam */ /* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true); TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha); float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU); CopyValues(index, indexCPU);
CopyValues(index, preID); CopyValues(index, preID);
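The flat Top-K indices are later split into a previous-hypothesis slot and a token id; a hypothetical sketch of that arithmetic (vocabSize and flatIndex are illustrative, not taken from the repo):

#include <cstdio>

int main()
{
    int flatIndex = 12345;                  /* one Top-K index (illustrative) */
    int vocabSize = 32000;                  /* assumed vocabulary size */
    int prevBeam  = flatIndex / vocabSize;  /* which hypothesis to extend */
    int wordId    = flatIndex % vocabSize;  /* which token to append */
    printf("beam=%d word=%d\n", prevBeam, wordId);
    return 0;
}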
...@@ -375,14 +374,14 @@ expand the search graph ...@@ -375,14 +374,14 @@ expand the search graph
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
>> reorderState - the new order of states >> reorderState - the new order of states
*/ */
void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState) void BeamSearch::Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState)
{ {
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!"); "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum); beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states; State* states = beam->states;
XTensor& idRef = beam->preID; XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore; XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob; XTensor& probRef = beam->prob;
...@@ -424,7 +423,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo ...@@ -424,7 +423,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
for (int i = 0; i < beam->stateNum; i += beamSize) { for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) { for (int j = 0; j < beamSize; j++) {
int k = i + j; int k = i + j;
T2TState& state = states[k]; State& state = states[k];
int offset = id.GetInt(k); int offset = id.GetInt(k);
int pid = i / beamSize; int pid = i / beamSize;
...@@ -432,7 +431,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo ...@@ -432,7 +431,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
if (offset != j) if (offset != j)
needReorder = true; needReorder = true;
T2TState* last = prev->states + pid * beamSize + offset; State* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!"); CheckNTErrors(offset >= 0, "Wrong state index!");
...@@ -482,12 +481,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses, ...@@ -482,12 +481,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap. we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
*/ */
void BeamSearch::Collect(T2TStateBundle* beam) void BeamSearch::Collect(StateBundle* beam)
{ {
T2TState* states = beam->states; State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) { for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i]; State& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize, CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!"); "Invalid sample id!");
...@@ -508,13 +507,13 @@ void BeamSearch::Collect(T2TStateBundle* beam) ...@@ -508,13 +507,13 @@ void BeamSearch::Collect(T2TStateBundle* beam)
fill the hypothesis heap with incomplete hypotheses fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final) >> beam - the beam that keeps a number of states (final)
*/ */
void BeamSearch::FillHeap(T2TStateBundle* beam) void BeamSearch::FillHeap(StateBundle* beam)
{ {
T2TState* states = beam->states; State* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) { for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) { for (int j = 0; j < beamSize; j++) {
T2TState& state = states[i * beamSize + j]; State& state = states[i * beamSize + j];
/* we push the incomplete hypothesis into the heap */ /* we push the incomplete hypothesis into the heap */
if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) { if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) {
...@@ -548,10 +547,10 @@ void BeamSearch::Dump(IntList* output, XTensor* score) ...@@ -548,10 +547,10 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
int c = heap.Count(); int c = heap.Count();
float bestScore = -1e9F; float bestScore = -1e9F;
T2TState* state = NULL; State* state = NULL;
for (int i = 0; i < c; i++) { for (int i = 0; i < c; i++) {
auto node = heap.Pop(); auto node = heap.Pop();
T2TState* s = (T2TState*)node.index; State* s = (State*)node.index;
if (i == 0 || bestScore < node.value) { if (i == 0 || bestScore < node.value) {
state = s; state = s;
bestScore = node.value; bestScore = node.value;
...@@ -619,12 +618,12 @@ void BeamSearch::SetEnd(const int* tokens, const int tokenNum) ...@@ -619,12 +618,12 @@ void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
check whether all hypotheses are completed check whether all hypotheses are completed
>> beam - the beam that keeps the searching states >> beam - the beam that keeps the searching states
*/ */
bool BeamSearch::IsAllCompleted(T2TStateBundle* beam) bool BeamSearch::IsAllCompleted(StateBundle* beam)
{ {
T2TState* states = beam->states; State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) { for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i]; State& state = states[i];
if (!state.isCompleted) if (!state.isCompleted)
return false; return false;
} }
...@@ -640,11 +639,11 @@ update the beam by removing finished hypotheses ...@@ -640,11 +639,11 @@ update the beam by removing finished hypotheses
>> alivePadding - new paddings for the inputs, (B, L) >> alivePadding - new paddings for the inputs, (B, L)
<< aliveIdx - the indices of alive states << aliveIdx - the indices of alive states
*/ */
void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding, void BeamSearch::RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState) XTensor& aliveState)
{ {
T2TState* states = beam->states; State* states = beam->states;
/* get the indices of uncompleted sentences and states */ /* get the indices of uncompleted sentences and states */
aliveSentList.Clear(); aliveSentList.Clear();
...@@ -674,12 +673,12 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi ...@@ -674,12 +673,12 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
} }
} }
InitTensor1D(&aliveState, aliveStateList.Size(), X_INT, aliveEncoding.devID); InitTensor1D(&aliveState, int(aliveStateList.Size()), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, aliveStateList.Size()); aliveState.SetData(aliveStateList.items, int(aliveStateList.Size()));
XTensor aliveSent; XTensor aliveSent;
InitTensor1D(&aliveSent, aliveSentList.Size(), X_INT, aliveEncoding.devID); InitTensor1D(&aliveSent, int(aliveSentList.Size()), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, aliveSentList.Size()); aliveSent.SetData(aliveSentList.items, int(aliveSentList.Size()));
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) { if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState); aliveInput = AutoGather(aliveInput, aliveState);
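A minimal CPU sketch (not the repo's AutoGather) of the gathering used here: only the rows named by the alive indices survive, which is how the batch shrinks once some hypotheses finish:

#include <vector>

std::vector<std::vector<float> > GatherRows(
    const std::vector<std::vector<float> >& rows,
    const std::vector<int>& indices)
{
    std::vector<std::vector<float> > kept;
    for (size_t i = 0; i < indices.size(); i++)
        kept.push_back(rows[indices[i]]);   /* keep only the alive rows */
    return kept;
}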
...@@ -697,7 +696,7 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi ...@@ -697,7 +696,7 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
make a mask to prevent duplicated entries in beam expansion for the first position make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states >> beam - the beam that keeps the searching states
*/ */
XTensor BeamSearch::MakeFirstMask(T2TStateBundle* beam) XTensor BeamSearch::MakeFirstMask(StateBundle* beam)
{ {
XTensor& prob = beam->prob; XTensor& prob = beam->prob;
XTensor mask; XTensor mask;
...@@ -742,7 +741,7 @@ initialize the model ...@@ -742,7 +741,7 @@ initialize the model
>> config - the configuration of the system >> config - the configuration of the system
*/ */
void GreedySearch::Init(T2TConfig& config) void GreedySearch::Init(Config& config)
{ {
batchSize = config.wBatchSize; batchSize = config.wBatchSize;
endSymbols[0] = config.endID; endSymbols[0] = config.endID;
...@@ -798,7 +797,7 @@ search for the most promising states ...@@ -798,7 +797,7 @@ search for the most promising states
>> padding - padding of the input >> padding - padding of the input
>> output - output that represents the sequences as rows >> output - output that represents the sequences as rows
*/ */
void GreedySearch::Search(T2TModel* model, XTensor& input, void GreedySearch::Search(Model* model, XTensor& input,
XTensor& padding, IntList* output) XTensor& padding, IntList* output)
{ {
XTensor maskEnc; XTensor maskEnc;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,15 +19,15 @@ ...@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#ifndef __T2TSEARCH_H__ #ifndef __SEARCH_H__
#define __T2TSEARCH_H__ #define __SEARCH_H__
#include "../T2TModel.h" #include "../Model.h"
#include "T2TPredictor.h" #include "Predictor.h"
using namespace std; using namespace std;
namespace transformer namespace nmt
{ {
/* The class organizes the search process. It calls "predictors" to generate /* The class organizes the search process. It calls "predictors" to generate
...@@ -42,7 +41,7 @@ private: ...@@ -42,7 +41,7 @@ private:
float alpha; float alpha;
/* predictor */ /* predictor */
T2TPredictor predictor; Predictor predictor;
/* max length of the generated sequence */ /* max length of the generated sequence */
int maxLength; int maxLength;
...@@ -88,28 +87,28 @@ public: ...@@ -88,28 +87,28 @@ public:
~BeamSearch(); ~BeamSearch();
/* initialize the model */ /* initialize the model */
void Init(T2TConfig& config); void Init(Config& config);
/* search for the most promising states */ /* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score); void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */ /* preparation */
void Prepare(int myBatchSize, int myBeamSize); void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */ /* compute the model score for each hypothesis */
void Score(T2TStateBundle* prev, T2TStateBundle* beam); void Score(StateBundle* prev, StateBundle* beam);
/* generate token indices via beam pruning */ /* generate token indices via beam pruning */
void Generate(T2TStateBundle* prev, T2TStateBundle* beam); void Generate(StateBundle* prev, StateBundle* beam);
/* expand the search graph */ /* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState); void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */ /* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam); void Collect(StateBundle* beam);
/* fill the hypothesis heap with incomplete hypotheses */ /* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle* beam); void FillHeap(StateBundle* beam);
/* save the output sequences and score */ /* save the output sequences and score */
void Dump(IntList* output, XTensor* score); void Dump(IntList* output, XTensor* score);
...@@ -118,17 +117,17 @@ public: ...@@ -118,17 +117,17 @@ public:
bool IsEnd(int token); bool IsEnd(int token);
/* check whether all hypotheses are completed */ /* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam); bool IsAllCompleted(StateBundle* beam);
/* update the beam by pruning finished states */ /* update the beam by pruning finished states */
void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding, void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx); XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */ /* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum); void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */ /* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam); XTensor MakeFirstMask(StateBundle* beam);
}; };
class GreedySearch class GreedySearch
...@@ -136,7 +135,7 @@ class GreedySearch ...@@ -136,7 +135,7 @@ class GreedySearch
private: private:
/* predictor */ /* predictor */
T2TPredictor predictor; Predictor predictor;
/* max length of the generated sequence */ /* max length of the generated sequence */
int maxLength; int maxLength;
...@@ -164,10 +163,10 @@ public: ...@@ -164,10 +163,10 @@ public:
~GreedySearch(); ~GreedySearch();
/* initialize the model */ /* initialize the model */
void Init(T2TConfig& config); void Init(Config& config);
/* search for the most promising states */ /* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output); void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */ /* preparation */
void Prepare(int myBatchSize); void Prepare(int myBatchSize);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,27 +19,25 @@ ...@@ -20,27 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#include <cmath> #include "Search.h"
#include "Translator.h"
#include "T2TTranslator.h" #include "../Utility.h"
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XTensor.h" #include "../../../tensor/XTensor.h"
#include "../../../tensor/XUtility.h" #include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TTranslator::T2TTranslator() Translator::Translator()
{ {
} }
/* de-constructor */ /* de-constructor */
T2TTranslator::~T2TTranslator() Translator::~Translator()
{ {
if (beamSize > 1) if (beamSize > 1)
delete (BeamSearch*)seacher; delete (BeamSearch*)seacher;
...@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator() ...@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator()
} }
/* initialize the model */ /* initialize the model */
void T2TTranslator::Init(T2TConfig& config) void Translator::Init(Config& config)
{ {
beamSize = config.beamSize; beamSize = config.beamSize;
vSize = config.srcVocabSize; vSize = config.srcVocabSize;
...@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config) ...@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config)
wordBatch = config.wBatchSize; wordBatch = config.wBatchSize;
if (beamSize > 1) { if (beamSize > 1) {
XPRINT1(0, stderr, "Translating with beam search (%d)\n", beamSize); LOG("translating with beam search (%d)", beamSize);
seacher = new BeamSearch(); seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config); ((BeamSearch*)seacher)->Init(config);
} }
else if (beamSize == 1) { else if (beamSize == 1) {
XPRINT1(0, stderr, "Translating with greedy search\n", beamSize); LOG("translating with greedy search");
seacher = new GreedySearch(); seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config); ((GreedySearch*)seacher)->Init(config);
} }
else { else {
CheckNTErrors(false, "invalid beam size\n"); CheckNTErrors(false, "Invalid beam size\n");
} }
} }
...@@ -80,8 +77,8 @@ test the model ...@@ -80,8 +77,8 @@ test the model
>> ofn - output data file >> ofn - output data file
>> model - pretrained model >> model - pretrained model
*/ */
void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, void Translator::Translate(const char* ifn, const char* sfn,
const char* ofn, T2TModel* model) const char* tfn, const char* ofn, Model* model)
{ {
int wc = 0; int wc = 0;
int wordCountTotal = 0; int wordCountTotal = 0;
...@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, ...@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
XTensor paddingEnc; XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn); batchLoader.Init(ifn, sfn, tfn);
XPRINT1(0, stderr, "[INFO] loaded the input file, elapsed=%.1fs \n", LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);
GetClockSec() - startT);
int count = 0; int count = 0;
double batchStart = GetClockSec(); double batchStart = GetClockSec();
...@@ -130,22 +126,22 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, ...@@ -130,22 +126,22 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
for (int i = 0; i < indices.Size() - 1; ++i) { for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result; Result* res = new Result;
res->id = indices[i]; res->id = int(indices[i]);
res->res = output[i]; res->res = output[i];
batchLoader.outputBuffer.Add(res); batchLoader.outputBuffer.Add(res);
} }
delete[] output; delete[] output;
wc += indices[-1]; wc += int(indices[-1]);
wordCountTotal += indices[-1]; wordCountTotal += int(indices[-1]);
sentCount += (indices.Size() - 1); sentCount += int(indices.Size() - 1);
batchCount += 1; batchCount += 1;
if (count % 1 == 0) { if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart; double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec(); batchStart = GetClockSec();
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%f, sword=%.1fw/s\n", LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()), elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed); double(wc) / elapsed);
wc = 0; wc = 0;
...@@ -169,7 +165,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, ...@@ -169,7 +165,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
double elapsed = GetClockSec() - startDump; double elapsed = GetClockSec() - startDump;
XPRINT2(0, stderr, "[INFO] translation completed (word=%d, sent=%llu)\n", LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size()); wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
} }
...@@ -178,7 +174,7 @@ dump the result into the file ...@@ -178,7 +174,7 @@ dump the result into the file
>> file - data file >> file - data file
>> output - output tensor >> output - output tensor
*/ */
void T2TTranslator::Dump(FILE* file, XTensor* output) void Translator::Dump(FILE* file, XTensor* output)
{ {
if (output != NULL && output->unitNum != 0) { if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1]; int seqLength = output->dimSize[output->order - 1];
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -21,17 +20,17 @@ ...@@ -21,17 +20,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/ */
#ifndef __T2TTESTER_H__ #ifndef __TESTER_H__
#define __T2TTESTER_H__ #define __TESTER_H__
#include "T2TSearch.h" #include "Search.h"
#include "T2TDataSet.h" #include "DataSet.h"
namespace transformer namespace nmt
{ {
/* This class translates test sentences with a trained model. */ /* This class translates test sentences with a trained model. */
class T2TTranslator class Translator
{ {
public: public:
/* vocabulary size of the source side */ /* vocabulary size of the source side */
...@@ -57,17 +56,17 @@ public: ...@@ -57,17 +56,17 @@ public:
public: public:
/* constructor */ /* constructor */
T2TTranslator(); Translator();
/* de-constructor */ /* de-constructor */
~T2TTranslator(); ~Translator();
/* initialize the model */ /* initialize the model */
void Init(T2TConfig& config); void Init(Config& config);
/* test the model */ /* test the model */
void Translate(const char* ifn, const char* vfn, const char* ofn, void Translate(const char* ifn, const char* vfn, const char* ofn,
const char* tfn, T2TModel* model); const char* tfn, Model* model);
/* dump the result into the file */ /* dump the result into the file */
void Dump(FILE* file, XTensor* output); void Dump(FILE* file, XTensor* output);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -21,8 +20,8 @@ ...@@ -21,8 +20,8 @@
#include <fstream> #include <fstream>
#include "T2TVocab.h" #include "Vocab.h"
#include "../module/T2TUtility.h" #include "../Utility.h"
namespace nts { namespace nts {
...@@ -31,7 +30,7 @@ void Vocab::Load(const string& src) ...@@ -31,7 +30,7 @@ void Vocab::Load(const string& src)
{ {
string vsz, sid; string vsz, sid;
ifstream f(src, ios::in); ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "Unable to open the vocabulary file"); CheckNTErrors(f.is_open(), "unable to open the vocabulary file");
/* get the vocab size and the start id */ /* get the vocab size and the start id */
f >> vsz >> sid; f >> vsz >> sid;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -19,8 +18,8 @@ ...@@ -19,8 +18,8 @@
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/ */
#ifndef __T2TVOCAB_H__ #ifndef __VOCAB_H__
#define __T2TVOCAB_H__ #define __VOCAB_H__
#include <cstdio> #include <cstdio>
#include <unordered_map> #include <unordered_map>
...@@ -30,10 +29,10 @@ using namespace std; ...@@ -30,10 +29,10 @@ using namespace std;
namespace nts { namespace nts {
/* user-defined symbols */ /* user-defined symbols */
#define UNK 0
#define PAD 1 #define PAD 1
#define SOS 2 #define SOS 2
#define EOS 2 #define EOS 2
#define UNK 3
/* the vocabulary class */ /* the vocabulary class */
struct Vocab struct Vocab
......
...@@ -180,8 +180,6 @@ extern FILE * tF; ...@@ -180,8 +180,6 @@ extern FILE * tF;
extern int tmpCountV2; extern int tmpCountV2;
extern int nnnTotal; extern int nnnTotal;
void PrintTrace(void);
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
#endif #endif
...@@ -1511,9 +1511,12 @@ void XMem::ShowMemUsage(FILE * file) ...@@ -1511,9 +1511,12 @@ void XMem::ShowMemUsage(FILE * file)
} }
MTYPE bufTotal = bufSize; MTYPE bufTotal = bufSize;
MTYPE bufUsed = this->bufUsed;
fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n", fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal); (DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsed / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
subtraction of data arrays (CUDA Kernel)
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta;
}
/*
tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta.
>> beta - the scaling factor
*/
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The tensors must be on the same device!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in subtraction!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
/* subtraction over arrays
tensor subtraction c = a - b * \beta (cuda version) with an input handle
>> devID - device ID (MUST >= 0)
>> handle - cuda handle
>> a - an array
>> b - another array
>> c - where we put a-b
>> size - size of the array
>> beta - the coefficient
*/
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
if (size == 0)
return;
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty arrays in subtraction!");
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if (c == a) {
    /* in-place case: axpy computes a = alpha * b + a, so the
       coefficient must be -beta to obtain a = a - b * beta */
    DTYPE alpha = -beta;
#ifdef DOUBELPRICSION
    cublasDaxpy(*handle, size, &alpha, b, 1, a, 1);
#else
    cublasSaxpy(*handle, size, &alpha, b, 1, a, 1);
#endif
}
else {
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#ifndef __SUB_CUH__
#define __SUB_CUH__
#include "Sub.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* subtraction of data arrays (CUDA Kernel) */
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) */
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) with an input handle */
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
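A worked example of the stride decomposition above, assuming a has shape (2, 3, 4) and n = 1 (the shape is illustrative):

#include <cstdio>

int main()
{
    int dimSize[3] = {2, 3, 4};
    int n = 1, stride = 1, blockNum = 1;
    for (int i = 2; i >= 0; i--) {
        if (i > n)
            stride *= dimSize[i];      /* dimensions after n */
        else if (i < n)
            blockNum *= dimSize[i];    /* dimensions before n */
    }
    /* prints stride=4 blockSize=3 blockNum=2: element (i, j, k) sits at
       offset i * 12 + j * 4 + k and has b[j] * beta subtracted from it */
    printf("stride=%d blockSize=%d blockNum=%d\n", stride, dimSize[n], blockNum);
    return 0;
}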
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
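A usage sketch of the broadcasting subtraction (illustrative; InitTensor2D, InitTensor1D and SetDataRand are the helpers used elsewhere in the library and are assumed available here):

#include "../../XTensor.h"
#include "SubDim.h"
using namespace nts;

/* subtract a length-3 vector from every row of a 2 x 3 tensor by
   broadcasting over dimension 1 */
void SubDimExample()
{
    XTensor a, b;
    InitTensor2D(&a, 2, 3, X_FLOAT);
    InitTensor1D(&b, 3, X_FLOAT);
    a.SetDataRand(0.0F, 1.0F);
    b.SetDataRand(0.0F, 1.0F);
    XTensor c = SubDim(a, b, 1);   /* c[i][j] = a[i][j] - b[j] */
}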
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
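/* after the barrier, every thread in the block reuses b's entry from
   fast shared memory instead of issuing its own global-memory load */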
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
...@@ -136,7 +136,6 @@ i.e., a is summed with b by broadcasting ...@@ -136,7 +136,6 @@ i.e., a is summed with b by broadcasting
>> a - a tensor >> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a >> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index >> n - the dimension index
>> inplace - indicates whether the result will be placed in the input tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta) void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
......
...@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a Glorot initialization*/ /* generate data items with a Glorot initialization*/
void _SetDataXavierNormal(XTensor * tensor, DTYPE gain = 1.0F); void _SetDataXavierNormal(XTensor * tensor, DTYPE gain = 1.0F);
/* generate data items with a xavier initialization */ /* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F); void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Guan Huhao 2020-02-05
* $Updated by: Xu Chen (email: hello_master1954@163.com) 2020-05-01
*/
#include "../../XGlobal.h"
#include "Float16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
float16 float16::SetOverFlow()
{
exp = 31;
data = 0;
return *this;
}
int float16::IsOverlFlow() const
{
return exp==31;
}
// masks used to locate the highest set bit
unsigned int float16::mask[32] =
{
0xffffffff,0xfffffffe,0xfffffffc,0xfffffff8,0xfffffff0,0xffffffe0,0xffffffc0,0xffffff80,
0xffffff00,0xfffffe00,0xfffffc00,0xfffff800,0xfffff000,0xffffe000,0xffffc000,0xffff8000,
0xffff0000,0xfffe0000,0xfffc0000,0xfff80000,0xfff00000,0xffe00000,0xffc00000,0xff800000,
0xff000000,0xfe000000,0xfc000000,0xf8000000,0xf0000000,0xe0000000,0xc0000000,0x80000000
};
// lookup table of powers of 2
unsigned int float16::pow2[32] =
{
0x00000001,0x00000002,0x00000004,0x00000008,0x00000010,0x00000020,0x00000040,0x00000080,
0x00000100,0x00000200,0x00000400,0x00000800,0x00001000,0x00002000,0x00004000,0x00008000,
0x00010000,0x00020000,0x00040000,0x00080000,0x00100000,0x00200000,0x00400000,0x00800000,
0x01000000,0x02000000,0x04000000,0x08000000,0x10000000,0x20000000,0x40000000,0x80000000,
};
// compare absolute values: return 1 if |a| < |b|, else 0
int float16::AbsCompare(const float16 & a, const float16 & b)
{
if (a.exp < b.exp)
return 1;
else if (a.exp > b.exp)
return 0;
return a.data < b.data;
}
// get the inverse: a value such that a * inverse(a) is approximately 1
float16 float16::GetInverse() const
{
float16 ans;
ans.sign = sign;
ans.exp = 29 - exp;
int rec = pow2[31];
// divide 0x80000000 by the mantissa (with the implicit leading 1 restored)
rec /= (this->data | pow2[10]);
if (!(rec & pow2[21])) {
rec <<= 1;
ans.exp++;
}
rec >>= 10;
ans.data = rec;
return ans;
}
/* constructor from (sign, exp, data), similar to IEEE 754 32-bit floating point
>> s - sign: 1 bit
>> e - exp: 5 bits
>> d - data: 10 bits
*/
float16::float16(const int& s, const int& e, const int& d)
{
sign = s;
exp = e;
data = d;
}
/* initialize the 16-bit floating point number to 0
*/
float16::float16()
{
sign = 0;
exp = 0;
data = 0;
}
/* constructor from another data type:
we convert the input to float, then convert the float to float16.
>> data - the number to convert
*/
template<class T>
float16::float16(const T& data)
{
*this = (float)data;
}
template float16::float16 (const int &);
template float16::float16 (const double &);
/* constructor from a 32-bit float
>> data - a 32-bit float number
*/
float16::float16(const float& data)
{
*this = data;
}
void float16::Dump()
{
printf("sign: %d\texp: %d\tdata: %d\n", sign, exp, data);
}
/*
convert float16 to float and return
the 32-bit layout is:
bit 31 holds the sign,
bits 30~23 hold the exponent (biased by 127),
bits 22~0 hold the mantissa
*/
float float16::Float()
{
int ret = 0;
ret = IsOverlFlow() ? 0x7f800000 :
(sign ? 0x80000000 : 0) | ((exp + 112) << 23) | (data << 13);
float p = *(float*)&ret;
return p;
}
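A quick standalone check (illustrative, not part of the library) of the re-biasing above: the half-precision exponent bias is 15 and the float bias is 127, so the stored exponent grows by 127 - 15 = 112 on conversion:

#include <cstdio>

int main()
{
    /* 1.0 in this half format: sign = 0, exp = 15, data = 0 */
    unsigned int bits = (0u << 31) | ((15u + 112u) << 23) | (0u << 13);
    float f = *(float*)&bits;   /* same type-punning as Float() above */
    printf("%f\n", f);          /* prints 1.000000: the bits are 0x3F800000 */
    return 0;
}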
// basic assignment function
float16 float16::operator = (const float16& a)
{
sign = a.sign;
exp = a.exp;
data = a.data;
return *this;
}
// convert float to float16
float16 float16::operator = (const float& a)
{
unsigned int p = *(unsigned int*)&a;
sign = p & pow2[31] ? 1 : 0;
if (a > 65535 || a < -65535)
return SetOverFlow();
exp = ((p >> 23)& (0xf)) | ((p >> 26 & 0x10));
data = (p >> 13);
return *this;
}
/* The template assignment function casts the other data type to float,
then calls the float assignment function.
It currently supports int and double.
*/
template <class T>
float16 float16::operator = (const T& data)
{
*this = (float)data;
return *this;
}
template float16 float16:: operator = <int>(const int&);
template float16 float16:: operator = <double>(const double&);
/*
template for multi-datatype overload
>> operation - the operator to overload, e.g., <, +
>> returnType - the return type of the function, e.g., int, float16
>> expression - the returned expression
*/
#define _OVERLOAD_OPRATER_TEMPLATE(operation, returnType, expression) \
template<class T> \
returnType float16::operator operation (const T & data) \
{ \
float16 rec=(float)data; \
return expression; \
} \
template returnType float16::operator operation <int>(const int&); \
template returnType float16::operator operation <float>(const float&); \
template returnType float16::operator operation <double>(const double&);
// overload operator < (less than) a < b
int float16::operator < (const float16& data)
{
if (sign < data.sign)
return 1;
else if (sign > data.sign)
return 0;
if (exp < data.exp)
return 1;
else if (exp > data.exp)
return 0;
return this->data < data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(< , int, *this < rec)
// overload operator <= (less than or equal) a <= b
int float16::operator <= (const float16& data)
{
if (sign < data.sign)
return 1;
else if (sign > data.sign)
return 0;
if (exp < data.exp)
return 1;
else if (exp > data.exp)
return 0;
return this->data <= data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(<= , int, *this <= rec)
// overload operator > (greater than) a > b
int float16::operator > (const float16& data)
{
if (sign > data.sign)
return 1;
else if (sign < data.sign)
return 0;
if (exp > data.exp)
return 1;
else if (exp < data.exp)
return 0;
return this->data > data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(> , int, * this > rec)
// overload operator >= (greater than or equal) a >= b
int float16::operator >= (const float16& data)
{
if (sign > data.sign)
return 1;
else if (sign < data.sign)
return 0;
if (exp > data.exp)
return 1;
else if (exp < data.exp)
return 0;
return this->data >= data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(>= , int, *this >= rec)
// overload operator + (add) a + b
float16 float16::operator + (const float16& data)
{
float16 ans;
// avoid overflow inf + anything = inf
if (this->IsOverlFlow())
return *this;
if (data.IsOverlFlow())
return data;
/* the operand with the greater magnitude determines the sign, and
the smaller one is shifted right to align with it */
if (AbsCompare(*this, data)) {
ans.sign = data.sign;
// record the exponent
int recp = data.exp;
// compute the mantissa sum (implicit leading bits restored)
int recd = (data.data | (pow2[10])) +
((data.sign ^ sign) ? -1 : 1) *
(((pow2[10]) | this->data) >> (data.exp - exp));
// the sum may carry; if so, shift the mantissa and adjust the exponent
if (recd) {
// shift right while any bit above bit 10 is set
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
// shift left until the leading 1 reaches bit 10
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
// if data==0, exp should be 0
else
recp = 0;
ans.data = recd;
// saturate if the exponent overflows
if (recp >= 31)
ans.SetOverFlow();
else {
ans.exp = recp;
ans.data = recd;
}
}
// same as above; the branch is duplicated to avoid extra assignments
else {
ans.sign = sign;
int recp = exp;
int recd = (this->data | (pow2[10])) +
((sign ^ data.sign) ? -1 : 1) *
(((pow2[10]) | data.data) >> (exp - data.exp));
if (recd) {
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
else
recp = 0;
if (recp >= 31)
ans.SetOverFlow();
else {
ans.exp = recp;
ans.data = recd;
}
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(+, float16, *this = *this + rec)
// overload operator += (add) a += b
float16 float16::operator+=(const float16& data) {
return *this = *this + data;
}
_OVERLOAD_OPRATER_TEMPLATE(+=, float16, *this = *this + rec)
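A worked example of the alignment step, using 2.5 + 0.5 (an illustration with hand-encoded values, not code from this commit): 2.5 encodes as exp = 16, data = 0x100 and 0.5 as exp = 14, data = 0x000.

// Mirror the add path for 2.5 + 0.5 with plain unsigned arithmetic.
#include <cstdio>

int main()
{
    unsigned bigM   = 0x400u | 0x100u;        // implicit bit | data of 2.5
    unsigned smallM = (0x400u | 0x000u) >> 2; // 0.5 shifted by the exp gap (16 - 14)
    unsigned sum = bigM + smallM;             // 0x600: leading 1 already at bit 10
    // result: exp = 16, data = sum & 0x3FF = 0x200 -> 1.5 * 2^(16 - 15) = 3.0
    printf("mantissa sum = 0x%X -> %g\n", sum, (double)sum / 0x400 * 2);
    return 0;
}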
// overload operator - (negation) -a
float16 float16::operator - ()
{
sign ^= 1;
float16 rec = *this;
sign ^= 1;
return rec;
}
// overload operator - (subtraction) a - b
float16 float16::operator - (const float16& data)
{
float16 ans;
if (this->IsOverlFlow())
return *this;
if (data.IsOverlFlow())
return data;
/* same as addition, except for the sign handling:
subtracting a larger number from a positive one yields a negative result */
if (AbsCompare(*this, data)) {
ans.sign = !data.sign;
int recp = data.exp;
int recd = (data.data | (pow2[10])) +
((data.sign ^ sign) ? 1 : -1) *
(((pow2[10]) | this->data) >> (data.exp - exp));
if (recd) {
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
else recp = 0;
if (recp >= 31)
ans.SetOverFlow();
else {
ans.data = recd;
ans.exp = recp;
}
}
else {
ans.sign = sign;
int recp = exp;
int recd = (this->data | (pow2[10])) +
((sign ^ data.sign) ? 1 : -1) *
(((pow2[10]) | data.data) >> (exp - data.exp));
if (recd) {
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
else recp = 0;
if (recp >= 31)
ans.SetOverFlow();
else {
ans.data = recd;
ans.exp = recp;
}
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(-, float16, *this = *this - rec)
// overload operator -= (subtraction) a -= b
float16 float16::operator-=(const float16& data)
{
return *this = *this - data;
}
_OVERLOAD_OPRATER_TEMPLATE(-=, float16, *this = *this - rec)
// overload operator * (multiplication) a * b
float16 float16::operator * (const float16& data)
{
//if(IsOverlFlow())
// return *this;
//if(data.IsOverlFlow())
// return data;
float16 ans;
// XOR of the signs: different signs give 1 (negative), same signs give 0 (positive)
ans.sign = sign ^ data.sign;
// multiply the mantissas (implicit leading bits restored)
int rec = (data.data | pow2[10]) * (this->data | pow2[10]);
// compute the new exponent: add the biased exponents, remove one bias of 15, clamp at 0
int recp = exp + data.exp - 15 > 0 ? exp + data.exp - 15 : 0;
// if the product carried, renormalize the mantissa and exponent
rec >>= 10;
while (rec & mask[11]) {
++recp;
rec >>= 1;
}
if (recp >= 31)
ans.SetOverFlow();
else {
ans.exp = recp;
ans.data = rec;
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(*, float16, (*this)* rec)
// overload operator *= (multiplication) a *= b
float16 float16::operator *= (const float16& data)
{
return *this = *this * data;
}
_OVERLOAD_OPRATER_TEMPLATE(*=, float16, *this = *this * rec)
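The multiply path can also be traced by hand, e.g. for 1.5 * 1.5. This is an illustration only; the ~0x7FF test below assumes mask[11] covers all bits above bit 10, matching how the normalization loops here behave:

// Mirror the multiply path for 1.5 * 1.5: both operands encode as
// exp = 15, data = 0x200 (value = 1.5 * 2^0).
#include <cstdio>

int main()
{
    unsigned m = (0x400u | 0x200u) * (0x400u | 0x200u); // 1536 * 1536
    int exp = 15 + 15 - 15; // add biased exponents, remove one bias
    m >>= 10;               // drop extra fraction bits: 0x900
    while (m & ~0x7FFu) {   // renormalize until the leading 1 is at bit 10
        m >>= 1;
        ++exp;
    }
    // exp = 16, data = m & 0x3FF = 0x080 -> 1.125 * 2^(16 - 15) = 2.25
    printf("exp = %d, data = 0x%03X\n", exp, m & 0x3FFu);
    return 0;
}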
// overload operator / (division) a / b
float16 float16::operator / (const float16& data)
{
float16 ans;
// XOR of the signs: different signs give 1 (negative), same signs give 0 (positive)
ans.sign = sign ^ data.sign;
// compute the new exponent
int recp = exp - data.exp + 14;
// shift the dividend left before dividing to avoid precision loss;
// unsigned arithmetic keeps bit 31 from acting as a sign bit
unsigned int recd = (this->data << 21) | pow2[31];
recd /= (data.data | pow2[10]);
// renormalize: if the quotient reached bit 21, bring the leading 1 down to bit 20
if (recd & pow2[21]) {
recd >>= 1;
++recp;
}
if (recp >= 31)
ans.SetOverFlow();
else {
recd >>= 10;
ans.data = recd;
ans.exp = recp;
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(/ , float16, (*this) / rec)
// overload operator /= (division) a /= b
float16 float16::operator /= (const float16& data) {
return *this = *this / data;
}
_OVERLOAD_OPRATER_TEMPLATE(/=, float16, *this = *this / rec)
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Guan Huhao 2020-02-05
* $Updated by: Xu Chen (email: hello_master1954@163.com) 2020-05-01
*/
#ifndef FLOAT16_H
#define FLOAT16_H
namespace nts { // namespace nts(NiuTrans.Tensor)
struct float16
{
private:
/*
sign is the sign bit: 1 means negative, 0 means positive
exp is the exponent, stored with an offset (bias) of 15
data is the fraction; as in IEEE 754, the leading 1 is implicit and not stored
*/
unsigned short data : 10;
unsigned short exp : 5;
unsigned short sign : 1;
// masks used to locate the highest set bit
static unsigned int mask[32];
static unsigned int pow2[32];
//int FindHighOne(const int &num, int &l, int &r);
int AbsCompare(const float16 & a,const float16 & b);
public:
float16 SetOverFlow();
// check whether the value has overflowed
int IsOverlFlow() const;
/* constructor by (sign, exp, data)
similar to ieee 32 floating point
sign: 1bit
exp: 5bit
data: 10bit */
float16(const int& s, const int& e, const int& d);
/* default constructor
This initializes the 16bit floating point to 0. */
float16();
// constructor by a 32-bit float num
float16(const float& data);
// constructor by other datatype
template<class T> float16(const T& data);
void Dump();
// convert float16 to float and return
float Float();
/* assignment operators and template overloads
The float assignment operator is the basic one.
The template version casts the other datatype to float,
then calls the float assignment operator.
It is instantiated for int and double. */
float16 operator = (const float& data);
float16 operator = (const float16& data);
template<class T> float16 operator = (const T& data);
// overload operator < (less than) a < b
int operator < (const float16& data);
template<class T> int operator < (const T& data);
// overload operator <= (less than or equal) a <= b
int operator <= (const float16& data);
template<class T> int operator <= (const T& data);
// overload operator > (greater than) a > b
int operator > (const float16& data);
template<class T> int operator > (const T& data);
// overload operator >= (greater than or equal) a >= b
int operator >= (const float16& data);
template<class T> int operator >= (const T& data);
// overload operator + (add) a + b
float16 operator + (const float16& data);
template<class T> float16 operator + (const T& data);
// overload operator += (add) a += b
float16 operator += (const float16& data);
template<class T> float16 operator += (const T& data);
// overload operator - (negation) -a
float16 operator - ();
// overload operator - (subtraction) a - b
float16 operator - (const float16& data);
template<class T> float16 operator - (const T& data);
// overload operator -= (subtraction) a -= b
float16 operator -= (const float16& data);
template<class T> float16 operator -= (const T& data);
// overload operator * (multiplication) a * b
float16 operator * (const float16& data);
template<class T> float16 operator * (const T& data);
// overload operator *= (multiplication) a *= b
float16 operator *= (const float16& data);
template<class T> float16 operator *= (const T& data);
// get the multiplicative inverse: a * a.GetInverse() == 1
float16 GetInverse() const;
// overload operator / (division) a / b
float16 operator / (const float16& data);
template<class T> float16 operator / (const T& data);
// overload operator /= (division) a /= b
float16 operator /= (const float16& data);
template<class T> float16 operator /= (const T& data);
};
} // namespace nts(NiuTrans.Tensor)
#endif /* FLOAT16_H */
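A minimal usage sketch for the struct above (hypothetical; assumes float16.h/.cpp from this commit are built into the target):

#include <cstdio>
#include "float16.h"

int main()
{
    nts::float16 a = 2.5f;
    nts::float16 b = 0.5f;
    nts::float16 sum = a + b;  // 3.0
    nts::float16 prod = a * 2; // the int operand is routed through float: 5.0
    printf("sum = %f, prod = %f\n", sum.Float(), prod.Float());
    printf("a > b: %d\n", a > b); // comparisons return int; prints 1
    return 0;
}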
@@ -87,7 +87,7 @@ bool TestMultiply1()
 /* call Multiply function */
 _Multiply(s1, s2, t, 0, 0);
 _MultiplyMe(tMe, s2, 0, 0);
-tUser = Multiply(*s1, *s2, 0);
+tUser = Multiply(*s1, *s2, false, 0);
 /* check results */
 cpuTest = _CheckData(t, answer, tUnitNum, 1e-4F) &&
...
@@ -161,7 +161,7 @@ bool TestSub2()
 /* call Sub function */
 _Sub(a, b, c, beta);
 _SubMe(cMe, b, beta);
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(c, answer, unitNum, 1e-4F) &&
...
@@ -268,7 +268,7 @@ bool TestSub3()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -370,7 +370,7 @@ bool TestSub4()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -472,7 +472,7 @@ bool TestSub5()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "../core/utilities/CheckData.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "TSubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__
@@ -161,7 +161,7 @@ bool TestSum2()
 /* call Sum function */
 _Sum(a, b, c, beta);
 _SumMe(cMe, b, beta);
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(c, answer, unitNum, 1e-4F) &&
...
@@ -268,7 +268,7 @@ bool TestSum3()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -370,7 +370,7 @@ bool TestSum4()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -472,7 +472,7 @@ bool TestSum5()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
import argparse
from struct import pack
import torch
parser = argparse.ArgumentParser(description='Pack Pytorch model to NiuTensor')
parser.add_argument('-src', help='pytorch model', type=str, default='model.pt')
parser.add_argument('-tgt', help='niutensor model', type=str, default='model.bin')
args = parser.parse_args()
model = torch.load(args.src, map_location='cpu')
model = model['model']
def get_model_parameters(m):
'''
get flattened transformer model parameters
'''
p = []
w = None
for k in m:
if 'embed_tokens.weight' in k:
w = m[k]
elif m[k].numel() != 1:
# p.append(m[k])
if 'weight' in k:
# weights for qkv
if 'in_proj' in k:
dim = m[k].shape[0] // 3
p.append((m[k][:dim, :]).t())
p.append((m[k][dim:dim*2, :]).t())
p.append((m[k][dim*2:, :]).t())
else:
if 'norm' in k:
p.append(m[k])
else:
p.append(m[k].t())
else:
p.append(m[k])
# encoder embedding weight
p.append(w)
# decoder embedding weight
p.append(w)
# output weight
p.append(w)
return p
with torch.no_grad():
params = get_model_parameters(model)
params_number = pack("Q", len(params))
params_size = pack("Q" * len(params), *[p.numel() for p in params])
print('total params: ', len(params))
print('total params size: ', sum([p.numel() for p in params]))
with open(args.tgt+".name.txt", "w") as name_list:
for p in model:
name_list.write("{}\t{}\n".format(p, model[p].shape))
with open(args.tgt+".bin", 'wb') as tgt:
# part 1: number of parameters
# tgt.write(params_number)
# part 2: offsets of parameters
# tgt.write(params_size)
# part 3: values of parameters
for p in params:
values = pack("f" * p.numel(), *(p.contiguous().view(-1).cpu().tolist()))
tgt.write(values)
\ No newline at end of file
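Since the header sections are commented out, the output file is just a raw stream of native-endian float32 values in parameter order (note the script writes to args.tgt + ".bin", so the default output name is model.bin.bin). A hypothetical C++ reader for a quick sanity check:

// Hypothetical reader for the raw parameter stream written by the script
// above; assumes the file was produced on a little-endian machine, since
// pack("f", ...) uses native byte order.
#include <cstdio>
#include <vector>

int main()
{
    FILE* f = fopen("model.bin.bin", "rb");
    if (!f) return 1;
    std::vector<float> values;
    float v;
    while (fread(&v, sizeof v, 1, f) == 1)
        values.push_back(v);
    fclose(f);
    printf("read %zu float32 values\n", values.size());
    return 0;
}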