Commit 05715480 by huchi

Initial commit

/bin
/build
/out
sample/train/iwlst14de-en.train.log
/models
/source/niutensor/
# CMake minimum version
cmake_minimum_required(VERSION 2.8)
# Project's name
project(NiuTrans.NMT)
# The prefix of the generated executable file
set(NIUTRANS_EXE "NiuTrans.NMT")
set(NIUTRANS_DLL "${NIUTRANS_EXE}")
# Generated file path
set(EXECUTABLE_OUTPUT_PATH ../bin)
set(LIBRARY_OUTPUT_PATH ../lib)
# Use CMAKE_MACOSX_RPATH for MacOS
set(CMAKE_MACOSX_RPATH 1)
# Enable folder grouping in IDE project views
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
option(USE_CUDA "Use CUDA" OFF)
option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
option(USE_FP16 "Use FP16" OFF)
option(GEN_DLL "Generate Dynamic Link Library" OFF)
if (USE_CUDA)
if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
if(WIN32)
message(STATUS "HERE cuda")
set(CUDA_TOOLKIT_ROOT_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
else()
set(CUDA_TOOLKIT_ROOT_DIR "/usr/cuda-9.0")
endif()
endif()
message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}")
endif()
if(USE_MKL)
if(NOT DEFINED INTEL_ROOT)
if(WIN32)
message(STATUS "HERE mkl")
set(INTEL_ROOT "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows")
else()
set(INTEL_ROOT "/usr/intel/compilers_and_libraries_2020.2.254/linux")
endif()
endif()
message(STATUS "INTEL_ROOT: ${INTEL_ROOT}")
endif()
if(USE_OPENBLAS)
if(NOT DEFINED OPENBLAS_ROOT)
if(WIN32)
set(OPENBLAS_ROOT "D:/software/BaiduNetdiskDownload/thirdparty20170624/OpenBLAS")
else()
set(OPENBLAS_ROOT "/usr/OpenBLAS")
endif()
endif()
message(STATUS "OPENBLAS_ROOT: ${OPENBLAS_ROOT}")
endif()
# Find all the .cpp .h .cu .chu files in source folder
file(GLOB_RECURSE CPP_FILES source/*.cpp)
file(GLOB_RECURSE H_FILES source/*.h)
file(GLOB_RECURSE CU_FILES source/*.cu)
file(GLOB_RECURSE CUH_FILES source/*.cuh)
function(assign_source_group)
foreach(_source IN ITEMS ${ARGN})
if (IS_ABSOLUTE "${_source}")
file(RELATIVE_PATH _source_rel "${CMAKE_CURRENT_SOURCE_DIR}" "${_source}")
else()
set(_source_rel "${_source}")
endif()
get_filename_component(_source_path "${_source_rel}" PATH)
string(REPLACE "/" "\\" _source_path_msvc "${_source_path}")
source_group("${_source_path_msvc}" FILES "${_source}")
endforeach()
endfunction(assign_source_group)
function(my_add_executable)
foreach(_source IN ITEMS ${ARGN})
assign_source_group(${_source})
endforeach()
if(USE_CUDA)
cuda_add_executable(${ARGV})
else()
add_executable(${ARGV})
endif()
endfunction(my_add_executable)
# Set libs and compiler options for CUDA
if(USE_CUDA)
add_definitions(-DUSE_CUDA)
if(USE_FP16)
add_definitions(-DHALF_PRECISION)
endif()
find_package(CUDA ${CUDA_VERSION} REQUIRED)
if(WIN32)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 --disable-warnings -use_fast_math -DUSE_CUDA")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT_DIR}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}curand.lib")
else()
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
if(USE_FP16)
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -DHALF_PRECISION -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
else()
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
endif()
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
endif()
endif()
# Set libs and compiler options for MKL
if(USE_MKL)
add_definitions(-DMKL)
set(COMPILER_DIR "${INTEL_ROOT}/compiler")
set(MKL_DIR "${INTEL_ROOT}/mkl")
set(CPU_ARCH intel64)
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -DMKL")
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
include_directories(${MKL_DIR}/include)
set(COMPILER_LIB_DIR "${COMPILER_DIR}/lib/intel64/")
set(MKL_LIB_DIR "${MKL_DIR}/lib/intel64/")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}mkl_intel_lp64.lib")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}mkl_core.lib")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}mkl_intel_thread.lib")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5md.lib")
else()
if(USE_CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder -DMKL")
else()
set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC -DMKL")
endif(USE_CUDA)
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
include_directories(${MKL_DIR}/include)
set(COMPILER_LIB_DIR "${COMPILER_DIR}/lib/intel64/")
set(MKL_LIB_DIR "${MKL_DIR}/lib/intel64/")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}libmkl_intel_lp64.a")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}libmkl_core.a")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}libmkl_intel_thread.a")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5.a")
endif()
endif()
# Set libs and compiler options for OpenBLAS
if(USE_OPENBLAS)
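# Note: -DMKL is also defined below; this appears to be required so that
# the MKL-guarded BLAS code path is also compiled when using OpenBLAS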
add_definitions(-DUSE_BLAS -DMKL)
set(OPENBLAS_INCLUDE_DIR "${OPENBLAS_ROOT}/include")
set(OPENBLAS_LIB_DIR "${OPENBLAS_ROOT}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_BLAS")
if(WIN32)
link_directories(${OPENBLAS_LIB_DIR})
include_directories(${OPENBLAS_INCLUDE_DIR})
set(OPENBLAS_LIB_PATH ${OPENBLAS_LIB_PATH} "${OPENBLAS_LIB_DIR}/libopenblas.lib")
else()
link_directories(${OPENBLAS_LIB_DIR})
include_directories(${OPENBLAS_INCLUDE_DIR})
set(OPENBLAS_LIB_PATH ${OPENBLAS_LIB_PATH} "${OPENBLAS_LIB_DIR}/libopenblas.a")
endif()
endif()
# Integrate all libs
set(CUDA_LIB ${CUDA_LIB_PATH})
set(MKL_LIB ${MKL_LIB_PATH})
set(OPENBLAS_LIB ${OPENBLAS_LIB_PATH})
# Add executable files to project
# Generate dynamic link library about project
if(USE_CUDA)
if(GEN_DLL)
cuda_add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
endif()
else()
if(GEN_DLL)
add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES})
endif()
endif()
# Link external libs to executable files
# Link external libs to dynamic link library
if(WIN32)
add_definitions(-DWIN32)
set(MESS ${MESS} "On Windows")
if(USE_CUDA)
set(MESS ${MESS} " Use CUDA")
set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
endif()
if(USE_MKL)
set(MESS ${MESS} " Use MKL")
set(ALL_LIB ${ALL_LIB} ${MKL_LIB})
elseif(USE_OPENBLAS)
set(MESS ${MESS} " Use OpenBLAS")
set(ALL_LIB ${ALL_LIB} ${OPENBLAS_LIB})
endif()
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File :" ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB})
endif()
message(STATUS "${MESS}")
else()
add_definitions(-std=c++11)
set(MESS ${MESS} "On Linux")
if(USE_CUDA)
set(MESS ${MESS} " Use CUDA")
set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
set(FLAG ${FLAG} "-lpthread -lcudart -lnvidia-ml")
else()
set(FLAG ${FLAG} "-lpthread")
endif()
if(USE_MKL)
set(MESS ${MESS} " Use MKL")
set(ALL_LIB ${ALL_LIB} ${MKL_LIB})
set(FLAG ${FLAG} "-liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -ldl")
elseif(USE_OPENBLAS)
set(MESS ${MESS} " Use OpenBLAS")
set(ALL_LIB ${ALL_LIB} ${OPENBLAS_LIB})
set(FLAG ${FLAG} "-lopenblas")
endif()
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB} ${FLAG})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File: " ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
endif()
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# NiuTrans.NMT
- [Features](#features)
- [Installation](#installation)
- [Requirements](#requirements)
- [Build from Source](#build-from-source)
- [Configure with CMake](#configure-with-cmake)
- [Configuration Example](#configuration-example)
- [Compile on Linux](#compile-on-linux)
- [Compile on Windows](#compile-on-windows)
- [Usage](#usage)
- [Training](#training)
- [Commands](#commands)
- [An Example](#an-example)
- [Translating](#translating)
- [Commands](#commands-1)
- [An Example](#an-example-1)
- [Low Precision Inference](#low-precision-inference)
- [Converting Models from Fairseq](#converting-models-from-fairseq)
- [A Model Zoo](#a-model-zoo)
- [Papers](#papers)
- [Team Members](#team-members)
## Features
NiuTrans.NMT is a lightweight and efficient Transformer-based neural machine translation system. Its main features are:
* Few dependencies. It is implemented with pure C++, and all dependencies are optional.
* Fast decoding. It supports various decoding acceleration strategies, such as batch pruning and dynamic batch size.
* Advanced NMT models, such as [Deep Transformer](https://www.aclweb.org/anthology/P19-1176).
* Flexible running modes. The system runs on various platforms and devices (Linux or Windows, CPUs or GPUs, FP32 or FP16, etc.).
* Framework agnostic. It supports various models trained with other tools, e.g., fairseq models.
* The code is simple and friendly to beginners.
## Installation
### Requirements
* OS: Linux or Windows
* [GCC/G++](https://gcc.gnu.org/) >=4.8.4 (on Linux)
* [VC++](https://www.microsoft.com/en-us/download/details.aspx?id=48145) >=2015 (on Windows)
* [CMake](https://cmake.org/download/) >= 2.8
* [CUDA](https://developer.nvidia.com/cuda-92-download-archive) >= 9.2, <= 10.0 (optional)
* [MKL](https://software.intel.com/content/www/us/en/develop/tools/math-kernel-library.html) latest version (optional)
* [OpenBLAS](https://github.com/xianyi/OpenBLAS) latest version (optional)
### Build from Source
#### Configure with CMake
The default configuration builds the **pure CPU** version.
```bash
# Download the code
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
# Merge with NiuTensor
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
# Run CMake
cmake ..
```
You can add compilation options to the CMake command to support accelerations with MKL, OpenBLAS, or CUDA.
*Please note that you can only select at most one of MKL or OpenBLAS.*
* Use CUDA (required for training)
Add ``-DUSE_CUDA=ON`` and ``-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_PATH`` to the CMake command, where ``$CUDA_PATH`` is the path of the CUDA toolkit.
You can also add ``-DUSE_FP16=ON`` to the CMake command to enable half-precision support.
* Use MKL (optional)
Add ``-DUSE_MKL=ON`` and ``-DINTEL_ROOT=$MKL_PATH`` to the CMake command, where ``$MKL_PATH`` is the path of MKL.
* Use OpenBLAS (optional)
Add ``-DUSE_OPENBLAS=ON`` and ``-DOPENBLAS_ROOT=$OPENBLAS_PATH`` to the CMake command, where ``$OPENBLAS_PATH`` is the path of OpenBLAS.
*Note that half-precision requires GPUs with the Pascal architecture or newer.*
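For instance, a GPU build with half-precision enabled could be configured as follows (the CUDA path is only an example; point it at your own installation):
```bash
# configure a CUDA build with FP16 support (adjust the toolkit path to your setup)
cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.2 -DUSE_FP16=ON ..
```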
#### Configuration Example
We provide [several examples](./sample/compile/README.md) to build the project with different options.
#### Compile on Linux
```bash
make -j && cd ..
```
#### Compile on Windows
Add ``-A x64`` to the CMake command and it will generate a Visual Studio project on Windows, i.e., ``NiuTrans.NMT.sln``, so you can open and build it with Visual Studio (>= Visual Studio 2015).
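For example, with Visual Studio 2019 the full command could look like this (the generator name is an assumption; pick the one matching your installed Visual Studio):
```bash
cmake -G "Visual Studio 16 2019" -A x64 ..
```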
If the build succeeds, you will get an executable file **`NiuTrans.NMT`** in the `bin` directory.
## Usage
### Training
#### Commands
*Make sure to compile the program with CUDA, because training on CPUs is not supported yet.*
Step 1: Prepare the training data.
```bash
# Convert the BPE vocabulary
python3 tools/GetVocab.py \
-raw $bpeVocab \
-new $niutransVocab
```
Description:
* `raw` - Path of the BPE vocabulary.
* `new` - Path of the NiuTrans.NMT vocabulary to be saved.
```bash
# Binarize the training data
python3 tools/PrepareParallelData.py \
-src $srcFile \
-tgt $tgtFile \
-src_vocab $srcVocab \
-tgt_vocab $tgtVocab \
-output $trainingFile
```
Description:
* `src` - Path of the source language data. One sentence per line with tokens separated by spaces or tabs.
* `tgt` - Path of the target language data. The same format as the source language data.
* `src_vocab` - Path of the source language vocabulary. Its first line is the vocabulary size and the first index, followed by a word and its index on each subsequent line (see the sketch after this list).
* `tgt_vocab` - Path of the target language vocabulary. The same format as the source language vocabulary.
* `output` - Path of the training data to be saved.
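As a sketch, a vocabulary file in the format described above could look like this (the size, starting index, and words are made up for illustration):
```
10000 4
the 4
of 5
and 6
```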
Step 2: Train the model. A minimal command is shown below; a fuller example with common options follows the parameter list.
```bash
bin/NiuTrans.NMT \
-dev $deviceID \
-model $modelFile \
-train $trainingData \
-valid $validData
```
Description:
* `dev` - Device id (>= 0 for GPUs). Default: 0.
* `model` - Path of the model to be saved.
* `train` - Path to the training file. The same format as the output file in step 1.
* `valid` - Path to the validation file. The same format as the output file in step 1.
* `wbatch` - Word batch size. Default: 4096.
* `sbatch` - Sentence batch size. Default: 8.
* `mt` - Indicates whether the model runs for machine translation. Default: true.
* `dropout` - Dropout rate for the model. Default: 0.3.
* `fnndrop` - Dropout rate for fnn layers. Default: 0.1.
* `attdrop` - Dropout rate for attention layers. Default: 0.1.
* `lrate`- Learning rate. Default: 0.0015.
* `lrbias` - The parameter that controls the maximum learning rate in training. Default: 0.
* `nepoch` - Training epoch number. Default: 50.
* `nstep` - Training step number. Default: 100000.
* `nwarmup` - Step number of warm-up for training. Default: 8000.
* `adam` - Indicates whether Adam is used. Default: true.
* `adambeta1` - Hyperparameter of Adam. Default: 0.9.
* `adambeta2` - Hyperparameter of Adam. Default: 0.98.
* `adambeta` - Hyperparameter of Adam. Default: 1e-9.
* `shuffled` - Indicates whether the data file is shuffled for training. Default: true.
* `labelsmoothing` - Label smoothing factor. Default: 0.1.
* `nstepcheckpoint` - Number of steps after which we make a checkpoint. Default: -1.
* `epochcheckpoint` - Indicates whether we make a checkpoint after each training epoch. Default: true.
* `updatestep` - Number of batches that we collect for model update. Default: 1 (one can set > 1 for gradient accumulation).
* `sorted` - Indicates whether the sequence is sorted by length. Default: false.
* `bufsize` - Buffer size for the batch loader. Default: 50000.
* `doubledend` - Indicates whether we double the </s> symbol for the output of LM. Default: false.
* `smallbatch` - Indicates whether we use batchsize = max * sc rather than batchsize = word-number, where max is the maximum length and sc is the sentence number. Default: true.
* `bigbatch` - Counterpart of `smallbatch`. Default: false.
* `randbatch` - Randomize batches. Default: false.
* `bucketsize` - Bucket size for the batch loader. Default: wbatch * 10.
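Putting some of these options together, a typical training command could look like the following (all file names are placeholders; the values shown are the defaults listed above, except `updatestep`, which is raised to illustrate gradient accumulation):
```bash
bin/NiuTrans.NMT \
    -dev 0 \
    -model model.bin \
    -train train.data \
    -valid valid.data \
    -wbatch 4096 -sbatch 8 \
    -lrate 0.0015 -nwarmup 8000 \
    -nepoch 50 -nstep 100000 \
    -labelsmoothing 0.1 \
    -updatestep 4
```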
#### An Example
Refer to [this page](./sample/train/) for a training example.
### Translating
*Make sure to compile the program with CUDA and FP16 if you want to translate with FP16 on GPUs.*
#### Commands
```bash
bin/NiuTrans.NMT \
-dev $deviceID \
-test $inputFile \
-model $modelPath \
-sbatch $batchSize \
-beamsize $beamSize \
-srcvocab $srcVocab \
-tgtvocab $tgtVocab \
-output $outputFile
```
Description:
* `model` - Path of the model.
* `sbatch` - Sentence batch size. Default: 8.
* `dev` - Device id (-1 for CPUs, and >= 0 for GPUs). Default: 0.
* `beamsize` - Size of the beam. Set it to 1 for greedy search.
* `test` - Path of the input file. One sentence per line with tokens separated by spaces.
* `output` - Path of the output file to be saved. The same format as the input file.
* `srcvocab` - Path of the source language vocabulary. Its first line is the vocabulary size, followed by a word and its index on each subsequent line.
* `tgtvocab` - Path of the target language vocabulary. The same format as the source language vocabulary.
* `fp16 (optional)` - Inference with FP16. This will not work if the model is stored in FP32. Default: false.
* `lenalpha` - The alpha parameter controls the length preference. Default: 0.6.
* `maxlenalpha` - Scalar of the input sequence (for the max number of search steps). Default: 1.2.
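For example, a beam-search run with FP16 and explicit length settings could look like this (all paths are placeholders, and `-fp16` only works if the model is stored in FP16):
```bash
bin/NiuTrans.NMT \
    -dev 0 \
    -fp16 \
    -test test.txt \
    -model model.fp16.bin \
    -sbatch 64 -beamsize 4 \
    -lenalpha 0.6 -maxlenalpha 1.2 \
    -srcvocab vocab.src \
    -tgtvocab vocab.tgt \
    -output output.txt
```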
#### An Example
Refer to [this page](./sample/translate/) for a translation example.
## Low Precision Inference
NiuTrans.NMT supports inference with FP16. You can convert a model to FP16 with our tools:
```bash
python3 tools/FormatConverter.py \
-input $inputModel \
-output $outputModel \
-format $targetFormat
```
Description:
* `input` - Path of the raw model file.
* `output` - Path of the new model file.
* `format` - Target storage format, FP16 (Default) or FP32.
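For instance, converting a model to FP16 could look like this (file names are placeholders; check the tool's help for the exact values accepted by `-format`):
```bash
python3 tools/FormatConverter.py \
    -input model.fp32.bin \
    -output model.fp16.bin \
    -format fp16
```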
## Converting Models from Fairseq
The core implementation is framework agnostic, so we can easily convert models trained with other frameworks to a binary format for efficient inference.
The following frameworks and models are currently supported:
| | [fairseq (0.6.2)](https://github.com/pytorch/fairseq/tree/v0.6.2) |
| --- | :---: |
| Transformer ([Vaswani et al. 2017](https://arxiv.org/abs/1706.03762)) | ✓ |
| RPR attention ([Shaw et al. 2018](https://arxiv.org/abs/1803.02155)) | ✓ |
| Deep Transformer ([Wang et al. 2019](https://www.aclweb.org/anthology/P19-1176/)) | ✓ |
*Refer to [this page](https://fairseq.readthedocs.io/en/latest/getting_started.html#training-a-new-model) for the details about training models with fairseq.*
After training, you can convert the fairseq models and vocabulary with the following steps.
Step 1: Convert parameters of a single fairseq model
```bash
python3 tools/ModelConverter.py -src $src -tgt $tgt
```
Description:
* `src` - Path of the fairseq checkpoint, [refer to this for more details](https://fairseq.readthedocs.io/en/latest/).
* `tgt` - Path to save the converted model parameters. All parameters are stored in a binary format.
* `fp16 (optional)` - Save the parameters with 16-bit data type. Default: disabled.
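As a sketch, converting a single checkpoint with FP16 storage could look like this (the checkpoint name is a placeholder, and the exact form of the `fp16` switch is an assumption based on the description above):
```bash
python3 tools/ModelConverter.py -src checkpoint_best.pt -tgt model.bin -fp16
```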
Step 2: Convert the vocabulary:
```bash
python3 tools/VocabConverter.py -src $fairseqVocabPath -tgt $newVocabPath
```
Description:
* `src` - Path of the fairseq vocabulary, [refer to this for more details](https://fairseq.readthedocs.io/en/latest/).
* `tgt` - Path to save the converted vocabulary. Its first line is the vocabulary size, followed by a word and its index on each subsequent line.
*You may need to convert both the source language vocabulary and the target language vocabulary if they are not shared.*
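For unshared vocabularies, that means running the converter once per side, e.g. (the fairseq dictionary names are typical defaults, not guaranteed):
```bash
python3 tools/VocabConverter.py -src dict.de.txt -tgt vocab.de
python3 tools/VocabConverter.py -src dict.en.txt -tgt vocab.en
```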
## A Model Zoo
We provide several pre-trained models to test the system.
All models and runnable systems are packaged into Docker images so that one can easily reproduce our results.
Refer to [this page](./sample/translate) for more details.
## Papers
Here are the papers related to this project:
[Learning Deep Transformer Models for Machine Translation.](https://www.aclweb.org/anthology/P19-1176) Qiang Wang, Bei Li, Tong Xiao, Jingbo Zhu, Changliang Li, Derek F. Wong, Lidia S. Chao. 2019. Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics.
[The NiuTrans System for WNGT 2020 Efficiency Task.](https://www.aclweb.org/anthology/2020.ngt-1.24) Chi Hu, Bei Li, Yinqiao Li, Ye Lin, Yanyang Li, Chenglong Wang, Tong Xiao, Jingbo Zhu. 2020. Proceedings of the Fourth Workshop on Neural Generation and Translation.
## Team Members
This project is maintained by a joint team from NiuTrans Research and NEU NLP Lab. Current team members are
*Chi Hu, Bei Li, Yinqiao Li, Ye Lin, Quan Du, Tong Xiao and Jingbo Zhu*
Please contact niutrans@mail.neu.edu.cn if you have any questions.
# Compilation Example
Here are some compilation examples for Linux with MKL, OpenBLAS, or CUDA support.
**Replace the paths with those in your environment.**
## Compile with CUDA support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR='/home/nlplab/cuda9.2/' ..
make -j
```
## Compile with CUDA and FP16 support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR='/home/nlplab/cuda9.2/' -DUSE_FP16=ON ..
make -j
```
## Compile with MKL support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_MKL=ON -DINTEL_ROOT='/home/nlplab/intel/compilers_and_libraries_2020.2.254/linux' ..
make -j
```
## Compile with OpenBLAS support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/home/nlplab/openblas/' ..
make -j
```
# Training a new model
## IWSLT'14 German to English (Transformer)
The following instructions can train a Transformer model on the [IWSLT'14 German to English dataset](http://workshop2014.iwslt.org/downloads/proceeding.pdf).
Step 1: Prepare the training data:
*We provide the BPE codes for better reproducibility. The source and target vocabularies are shared with 10,000 merges.*
```bash
# Extract the data
cd sample/train/
IWSLT_PATH=iwslt14.tokenized.de-en
tar -zxvf $IWSLT_PATH.tar.gz
IWSLT_PATH=sample/train/$IWSLT_PATH
# Binarize the data
cd ../..
python3 tools/GetVocab.py \
-raw $IWSLT_PATH/bpevocab \
-new $IWSLT_PATH/vocab.de
python3 tools/GetVocab.py \
-raw $IWSLT_PATH/bpevocab \
-new $IWSLT_PATH/vocab.en
python3 tools/PrepareParallelData.py \
-src $IWSLT_PATH/train.de -tgt $IWSLT_PATH/train.en \
-src_vocab $IWSLT_PATH/vocab.de -tgt_vocab $IWSLT_PATH/vocab.en \
-output $IWSLT_PATH/train.data
python3 tools/PrepareParallelData.py \
-src $IWSLT_PATH/valid.de -tgt $IWSLT_PATH/valid.en \
-src_vocab $IWSLT_PATH/vocab.de -tgt_vocab $IWSLT_PATH/vocab.en \
-output $IWSLT_PATH/valid.data
```
*You may extract the data manually on Windows.*
Step 2: Train the model with the default configurations
(6 encoder/decoder layers, model size 512, 50 epochs):
```bash
bin/NiuTrans.NMT \
-dev 0 \
-nepoch 50 \
-model model.bin \
-maxcheckpoint 10 \
-train $IWSLT_PATH/train.data \
-valid $IWSLT_PATH/valid.data
```
Step 3: Average the last ten checkpoints:
```bash
python tools/Ensemble.py -input 'model.bin.*' -output model.ensemble
```
Training takes about 310 s per epoch on a GTX 1080 Ti.
Expected BLEU score (lenalpha=0.6, maxlenalpha=1.2):
| Model type | Beam Search | Greedy Search |
| --------------- | --------------- | --------------- |
| Single model | 34.05 (beam=4) | 33.35 |
| Ensemble model | 34.48 (beam=4) | 34.01 |
We provide models trained with the default configurations:
[Google Drive](https://drive.google.com/drive/folders/10W89cx60Q7A9nGyg5fwLP21Sg53n6NXV?usp=sharing)
[Baidu Cloud](https://pan.baidu.com/s/1LbkV8kuaDWNunVR2jwOhRg) (password: bdwp)
# Translating with pre-trained models
## IWSLT'14 De-En Models
The following instructions can be used to translate with a pre-trained Transformer model.
You can evaluate models trained in the [training example](../sample/train) in two steps.
Step 1: Translate the IWSLT14 De-En test set (tokenized) on the GPU:
```bash
IWSLT_PATH=sample/train/iwslt14.tokenized.de-en
bin/NiuTrans.NMT \
-dev 0 \
-test $IWSLT_PATH/test.de \
-model model.bin \
-sbatch 64 \
-beamsize 1 \
-srcvocab $IWSLT_PATH/vocab.de \
-tgtvocab $IWSLT_PATH/vocab.en \
-output output.atat
sed -r 's/(@@ )|(@@ ?$)//g' < output.atat > output
```
You can also set `-dev -1` to use the CPU.
Step 2: Check the translation with [multi-bleu](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl):
```bash
perl multi-bleu.perl $IWSLT_PATH/test.en < output
```
It takes about 15 s to translate test.de (6,750 sentences) on a GTX 1080 Ti with greedy search.
## WNGT 2020 Models
The models here are the submissions to the [WNGT 2020 efficiency task](https://sites.google.com/view/wngt20/efficiency-task), which focuses on developing efficient MT systems.
The WNGT 2020 efficiency task constrains systems to translate 1 million sentences on CPUs and GPUs under the conditions of the [WMT 2019 English-German news](http://statmt.org/wmt19/translation-task.html) translation task.
- For CPUs, the performance was measured on an [AWS c5.metal instance](https://aws.amazon.com/cn/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) with 96 logical Cascade Lake processors and 192 GB memory. We submitted one system (9-1-tiny) running with all CPU cores.
- For GPUs, the performance was measured on an [AWS g4dn.xlarge instance](https://aws.amazon.com/cn/ec2/instance-types/g4/) with an NVIDIA T4 GPU and 16 GB memory. We submitted four systems (9-1, 18-1, 35-1, 35-6) running with FP16.
We list the results of all submissions. See [the official results](https://docs.google.com/spreadsheets/d/1M82S5wPSIM543Gh20d71Zs0FNHJQ3JdiJzDECiYJNlE/edit#gid=0) for more details.
| Model type | Time (s) | File size (MiB) | BLEU | Words per second |
| ---------- | -------- | --------------- | ---- | --------------- |
| 9-1-tiny* | 810 | 66.8 | 27.0 | 18518 |
| 9-1 | 977 | 99.3 | 31.1 | 15353 |
| 18-1 | 1355 | 156.1 | 31.4 | 11070 |
| 35-1 | 2023 | 263.3 | 32.0 | 7418 |
| 35-6 | 3166 | 305.4 | 32.2 | 4738 |
<em>* means run on CPUs. </em>
Description:
* `Model type` - Number of encoder and decoder layers, e.g., 9-1 means that the model consists of 9 encoder layers and 1 decoder layer. The model size is 512 except for the *tiny* model, whose size is 256.
* `Time` - Wall-clock time taken to translate the whole test set, which contains about 1 million sentences with ~15 million tokens. The time of the `tiny` model was measured on CPUs, while the other models were measured on GPUs.
* `File size` - All models are stored in FP16 except for the `tiny` model stored in FP32.
* `BLEU` - We report the average sacreBLEU score across wmt10 to wmt19, excluding wmt12. Signature: BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt10+tok.13a+version.1.4.9 (for wmt10; similar for the others).
All these models and docker images are available at:
[Baidu Cloud](https://pan.baidu.com/s/1J8kRoF3d5P-XA4Qd3YT4ZQ) password: bdwp
[Google Drive](https://drive.google.com/file/d/1tgCUN8TnUsbcI7BCYFQkj30rCvk68YRb) (docker images only)
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include "./nmt/NMT.h"
#include "niutensor/network/XNoder.h"
#include "niutensor/tensor/XTensor.h"
#include "niutensor/tensor/core/movement/Spread.h"
using namespace nmt;
using namespace nts;
void test() {
XTensor input, node, index;
InitTensor2D(&input, 32, 4);
/* note: the original code initialized `input` three times and left
   `node` and `index` uninitialized, which looks like a typo */
InitTensor2D(&node, 13, 4);
InitTensor2D(&index, 32, 4);
XNoder::MakeGrad(&input);
XNoder::MakeGrad(&node);
XTensor* tmp = NewTensorBufV2(&input, input.devID, input.mem);
_SpreadForGather(tmp, node.grad, &index);
_SumMe(input.grad, tmp);
input.grad->Dump(stderr);
}
int main(int argc, const char** argv)
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
NMTMain(argc - 1, argv + 1);
//test();
//_CrtDumpMemoryLeaks();
return 0;
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Decoder.h"
#include "Utility.h"
#include "layer/LayerNorm.h"
#include "layer/CommonModules.h"
#include "../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
AttDecoder::AttDecoder()
{
selfAtt = NULL;
fnns = NULL;
selfAttLayerNorms = NULL;
fnnLayerNorms = NULL;
enDeAtt = NULL;
enDeAttLayerNorms = NULL;
decoderLayerNorm = NULL;
selfAttCache = NULL;
enDeAttCache = NULL;
}
/* destructor */
AttDecoder::~AttDecoder()
{
delete[] selfAttCache;
delete[] enDeAttCache;
delete[] selfAtt;
delete[] fnns;
delete[] selfAttLayerNorms;
delete[] fnnLayerNorms;
delete[] enDeAtt;
delete[] enDeAttLayerNorms;
if (preNorm)
delete decoderLayerNorm;
}
/*
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nDecLayer;
hSize = config.modelSize;
eSize = config.embSize;
vSize = config.tgtVocabSize;
dropoutP = config.dropout;
preNorm = config.preNorm;
CheckNTErrors(nlayer >= 1, "There must be at least one decoder layer!");
CheckNTErrors(vSize > 1, "Set the target vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(config, false);
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
selfAttLayerNorms = new LN[nlayer];
enDeAtt = new Attention[nlayer];
enDeAttLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
selfAttLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
selfAttCache[i].enable = true;
enDeAttCache[i].enable = true;
}
if (preNorm)
decoderLayerNorm->InitModel(config);
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor ende;
XTensor fnn;
XTensor res;
XTensor selfAttnBefore;
XTensor selfAttnAfter;
XTensor endeAttnBefore;
XTensor endeAttnAfter;
XTensor fnnBefore;
/* layer normalization with pre-norm for self-attn */
selfAttnBefore = LayerNorm(x, selfAttLayerNorms[i], preNorm, true, false);
/******************/
/* self attention */
att = selfAtt[i].Make(selfAttnBefore, selfAttnBefore, selfAttnBefore,
mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization with post-norm for self-attention */
selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for encoder-decoder attention */
endeAttnBefore = LayerNorm(selfAttnAfter, enDeAttLayerNorms[i], preNorm, true, false);
/* encoder-decoder attention */
ende = enDeAtt[i].Make(outputEnc, endeAttnBefore, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, selfAttnAfter);
/* layer normalization with post-norm for encoder-decoder attention */
endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(endeAttnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
fnn = fnns[i].Make(fnnBefore, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, endeAttnAfter);
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network (pre-norm variant)
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for encoder-decoder attention */
x = enDeAttLayerNorms[i].Make(x);
/* encoder-decoder attention */
x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
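/* final layer normalization; MakeFast assumes the pre-norm setting, in which decoderLayerNorm was allocated by InitModel */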
x = decoderLayerNorm->Make(x);
return x;
}
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __DECODER_H__
#define __DECODER_H__
#include "Encoder.h"
#include "Utility.h"
namespace nmt
{
class AttDecoder
{
public:
/* device id */
int devID;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* embedding of word at each position */
Embedder embedder;
/* FNN model of each layer */
FNN* fnns;
/* attention model of each layer */
Attention* selfAtt;
/* layer normalization for attention */
LN* selfAttLayerNorms;
/* layer normalization for fnn */
LN* fnnLayerNorms;
/* layer normalization for decoder */
LN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
Attention* enDeAtt;
/* layer normalization for encoder-decoder attention */
LN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
/* layer cache list */
Cache* enDeAttCache;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttDecoder();
/* destructor */
~AttDecoder();
/* initialize the model */
void InitModel(Config& config);
/* make the decoding network */
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre-norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Encoder.h"
#include "Utility.h"
#include "layer/LayerNorm.h"
#include "layer/CommonModules.h"
#include "../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
AttEncoder::AttEncoder()
{
selfAtt = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
encoderLayerNorm = NULL;
}
/* destructor */
AttEncoder::~AttEncoder()
{
delete[] selfAtt;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
if (preNorm)
delete encoderLayerNorm;
}
/*
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nEncLayer;
eSize = config.embSize;
hSize = config.modelSize;
vSize = config.srcVocabSize;
preNorm = config.preNorm;
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "There must be at least one encoder layer!");
CheckNTErrors(vSize > 1, "Set the source vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(config);
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
attLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
if (preNorm)
encoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
attLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
}
if (preNorm)
encoderLayerNorm->InitModel(config);
}
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor fnn;
XTensor res;
XTensor attnBefore;
XTensor attnAfter;
XTensor fnnBefore;
/* layer normalization with pre-norm for self-attn */
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization with post-norm for self-attn */
attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(attnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
fnn = fnns[i].Make(fnnBefore, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, attnAfter);
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (pre-norm variant)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = attLayerNorms[i].Make(x);
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
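/* final layer normalization; MakeFast assumes the pre-norm setting, in which encoderLayerNorm was allocated by InitModel */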
x = encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __ENCODER_H__
#define __ENCODER_H__
#include "Utility.h"
#include "layer/FNN.h"
#include "layer/Attention.h"
#include "layer/Embedding.h"
#include "layer/LayerNorm.h"
#include "../niutensor/network/XNet.h"
using namespace nts;
namespace nmt
{
/*
base class of the encoder
*/
class Encoder
{
public:
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
};
/*
the encoder based on self-attention
*/
class AttEncoder : Encoder
{
public:
/* device id */
int devID;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. This is useful in LM where the first position needs
a special design for the attention model. */
int ignored;
/* embedding of word at each position */
Embedder embedder;
/* FNN model of each layer */
FNN* fnns;
/* attention model of each layer */
Attention* selfAtt;
/* layer normalizations for attention */
LN* attLayerNorms;
/* layer normalization for fnn */
LN* fnnLayerNorms;
/* layer normalization for encoder */
LN* encoderLayerNorm;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttEncoder();
/* de-constructor */
~AttEncoder();
/* initialize the model */
void InitModel(Config& config);
/* make the encoding network */
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (fast path with pre-norm) */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cstdint>
#include "Model.h"
#include "Utility.h"
#include "../niutensor/tensor/XUtility.h"
#include "../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Model::Model()
{
devID = -1;
isLM = false;
isMT = false;
useFP16 = false;
shareAllEmbeddings = 0;
shareDecInputOutputWeight = 0;
nhead = 1;
encoder = new AttEncoder();
decoder = new AttDecoder();
outputLayer = new Output();
}
/* de-constructor */
Model::~Model()
{
delete encoder;
delete decoder;
delete outputLayer;
}
/*
initialize the model
>> config - configurations of the model
*/
void Model::InitModel(Config& config)
{
devID = config.devID;
isMT = config.isMT;
isLM = !isMT;
useFP16 = config.useFP16;
/* configurations for the model */
int* metaInfo[] = {
&config.nEncLayer, &config.nDecLayer,
&config.fnnHiddenSize, &config.modelSize,
&config.embSize, &config.srcVocabSize,
&config.tgtVocabSize, &config.nhead,
&config.maxRP, &config.shareAllEmbeddings,
&config.shareDecInputOutputWeight,
&config.maxPosLen
};
FILE* modelFile = NULL;
/* read model configurations */
if (!config.isTraining) {
modelFile = fopen(config.modelFN, "rb");
CheckNTErrors(modelFile, "Failed to open the model file");
for (auto& meta : metaInfo) {
fread(meta, sizeof(int), 1, modelFile);
}
}
else {
/* read the source and target vocab size */
FILE* trainF = fopen(config.trainFN, "rb");
CheckNTErrors(trainF, "Failed to open the training file");
fread(&config.srcVocabSize, sizeof(config.srcVocabSize), 1, trainF);
fread(&config.tgtVocabSize, sizeof(config.tgtVocabSize), 1, trainF);
CheckNTErrors(config.srcVocabSize > 0, "Invalid source vocabulary size");
CheckNTErrors(config.tgtVocabSize > 0, "Invalid target vocabulary size");
fclose(trainF);
}
nhead = config.nhead;
shareAllEmbeddings = config.shareAllEmbeddings;
shareDecInputOutputWeight = config.shareDecInputOutputWeight;
ShowModelConfig(config);
encoder->InitModel(config);
outputLayer->InitModel(config);
if (isMT)
decoder->InitModel(config);
/* load parameters */
if (!config.isTraining)
Read(modelFile);
else {
TensorList params;
GetParams(params);
for (int i = 0; i < params.Size(); i++)
params[i]->SetVarFlag();
}
if (modelFile != NULL)
fclose(modelFile);
}
/*
print model configurations
>> config - model configurations
*/
void Model::ShowModelConfig(Config& config)
{
/* TODO: output more info */
XPRINT1(0, stderr, "encoder layer: %d\n", config.nEncLayer);
XPRINT1(0, stderr, "decoder layer: %d\n", config.nDecLayer);
XPRINT1(0, stderr, "attention heads: %d\n", config.nhead);
XPRINT1(0, stderr, "model size: %d\n", config.modelSize);
XPRINT1(0, stderr, "source vocab size: %d\n", config.srcVocabSize);
XPRINT1(0, stderr, "target vocab size: %d\n", config.tgtVocabSize);
}
/*
make the encoding network
>> input - input tensor, (batchSize, srcLen)
>> mask - the mask for encoder self-attention, (headNum, batchSize, srcLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result, (batchSize, srcLen, hiddenDim)
*/
XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
return encoder->Make(input, mask, nothing, isTraining);
}
/*
make the decoding network
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> outputEnc - output tensor of the encoder, (batchSize, srcLen, hiddenDim)
>> mask - mask for decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask for the encoder-decoder attention, (headNum, batchSize, tgtLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - decoding result, (batchSize, tgtLen, hiddenDim)
*/
XTensor Model::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
inputDec.GetDim(1), isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
int len = padding.GetDim(padding.order - 1);
int* dims = new int[padding.order + 2];
for (int i = 0; i < padding.order; i++)
dims[i + 1] = padding.GetDim(i);
dims[0] = nhead;
dims[padding.order + 1] = len;
XTensor mask;
InitTensor(&mask, padding.order + 2, dims, X_FLOAT, padding.devID);
delete[] dims;
/* an upper triangular matrix whose upper-triangular cells are set to -1e9
(the lower triangle, including the diagonal, is 0). Adding it to the attention
scores prevents attention to the following words in a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
ScaleAndShiftMe(mask, 1.0F, -1e9F);
/* forward */
XTensor encoding;
encoding = MakeEncoder(input, &mask, isTraining);
outputLayer->Make(encoding, output, true, true);
}
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder, (batchSize, srcLen)
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> output - output tensor (distribution), (batchSize, tgtLen, hiddenDim)
>> paddingEnc - padding of the sequences (on the encoder side), (batchSize, srcLen)
>> paddingDec - padding of the sequences (on the decoder side), (batchSize, tgtLen)
>> isTraining - indicates whether the model is for training
*/
void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
/* encoder mask */
MakeMTMaskEnc(paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output, true, true);
}
/*
make the mask for training MT models
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskEnc - mask of the encoder self-attention
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int* dims = new int[inputDec.order + 2];
for (int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9
(the lower triangle, including the diagonal, is 0). Adding it to the attention
scores prevents attention to the following words in a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the mask of the encoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> maskEnc - mask of the encoder self-attention, (headNum, batchSize, srcLen, srcLen)
*/
void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
{
XTensor padding2;
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
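/* Worked example (illustrative): for one sentence with paddingEnc = [1, 1, 1, 0, 0],
   Unsqueeze + ScaleAndShiftMe(1e9, -1e9) maps each padding value p to p * 1e9 - 1e9:
       valid position   (p = 1) ->    0
       padding position (p = 0) -> -1e9
   so every row of the (nhead, batchSize, srcLen, srcLen) mask reads
       [0, 0, 0, -1e9, -1e9]
   and the softmax over the attention scores assigns ~0 weight to padding. */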
/*
make the mask of the decoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> paddingDec - padding of the decoder input, (batchSize, tgtLen)
>> maskDec - mask of the decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask of the decoder enc-dec attention, (headNum, batchSize, tgtLen, srcLen)
*/
void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec)
{
int len = paddingDec.GetDim(paddingDec.order - 1);
int* dims = new int[paddingDec.order + 2];
for (int i = 0; i < paddingDec.order; i++)
dims[i + 1] = paddingDec.GetDim(i);
dims[0] = nhead;
dims[paddingDec.order + 1] = len;
InitTensor(&maskDec, paddingDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* An upper triangular matrix whose upper-triangular cells are set to -1e9
(the lower triangle, including the diagonal, is 0). Adding it to the attention
scores blocks attention to the following words in a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
XTensor maskEncDecTMP;
Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
delete[] dims;
}
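/* Worked example (illustrative): for tgtLen = 4, _SetDataLowTri(1e9, 0) followed by
   ScaleAndShiftMe(1.0, -1e9) yields the causal mask
       [   0, -1e9, -1e9, -1e9 ]
       [   0,    0, -1e9, -1e9 ]
       [   0,    0,    0, -1e9 ]
       [   0,    0,    0,    0 ]
   which, added to the attention scores, blocks attention to future positions
   while keeping the current and previous positions visible. */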
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void Model::GetParams(TensorList& list)
{
list.Clear();
/* encoder parameters */
for (int i = 0; i < encoder->nlayer; i++) {
list.Add(&encoder->selfAtt[i].weightQ);
list.Add(&encoder->selfAtt[i].weightK);
list.Add(&encoder->selfAtt[i].weightV);
list.Add(&encoder->selfAtt[i].biasQ);
list.Add(&encoder->selfAtt[i].biasK);
list.Add(&encoder->selfAtt[i].biasV);
if (encoder->selfAtt[i].useRPR)
list.Add(&encoder->selfAtt[i].RPEmbK);
list.Add(&encoder->selfAtt[i].weightO);
list.Add(&encoder->selfAtt[i].biasO);
list.Add(&encoder->fnns[i].w1);
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
list.Add(&encoder->fnns[i].b2);
list.Add(&encoder->attLayerNorms[i].w);
list.Add(&encoder->attLayerNorms[i].b);
list.Add(&encoder->fnnLayerNorms[i].w);
list.Add(&encoder->fnnLayerNorms[i].b);
}
if (encoder->preNorm) {
list.Add(&encoder->encoderLayerNorm->w);
list.Add(&encoder->encoderLayerNorm->b);
}
if (isMT) {
/* decoder parameters */
for (int i = 0; i < decoder->nlayer; i++) {
list.Add(&decoder->selfAtt[i].weightQ);
list.Add(&decoder->selfAtt[i].weightK);
list.Add(&decoder->selfAtt[i].weightV);
list.Add(&decoder->selfAtt[i].biasQ);
list.Add(&decoder->selfAtt[i].biasK);
list.Add(&decoder->selfAtt[i].biasV);
if (decoder->selfAtt[i].useRPR)
list.Add(&decoder->selfAtt[i].RPEmbK);
list.Add(&decoder->selfAtt[i].weightO);
list.Add(&decoder->selfAtt[i].biasO);
list.Add(&decoder->selfAttLayerNorms[i].w);
list.Add(&decoder->selfAttLayerNorms[i].b);
list.Add(&decoder->enDeAtt[i].weightQ);
list.Add(&decoder->enDeAtt[i].weightK);
list.Add(&decoder->enDeAtt[i].weightV);
list.Add(&decoder->enDeAtt[i].biasQ);
list.Add(&decoder->enDeAtt[i].biasK);
list.Add(&decoder->enDeAtt[i].biasV);
list.Add(&decoder->enDeAtt[i].weightO);
list.Add(&decoder->enDeAtt[i].biasO);
list.Add(&decoder->enDeAttLayerNorms[i].w);
list.Add(&decoder->enDeAttLayerNorms[i].b);
list.Add(&decoder->fnns[i].w1);
list.Add(&decoder->fnns[i].b1);
list.Add(&decoder->fnns[i].w2);
list.Add(&decoder->fnns[i].b2);
list.Add(&decoder->fnnLayerNorms[i].w);
list.Add(&decoder->fnnLayerNorms[i].b);
}
if (decoder->preNorm) {
list.Add(&decoder->decoderLayerNorm->w);
list.Add(&decoder->decoderLayerNorm->b);
}
}
list.Add(&encoder->embedder.w);
if (isMT && (shareAllEmbeddings == 0)) {
list.Add(&decoder->embedder.w);
}
if (shareDecInputOutputWeight == 0) {
list.Add(&outputLayer->w);
}
}
/*
dump the model to a file
>> fn - path to the file where the model is saved
*/
void Model::Dump(const char* fn)
{
double startT = GetClockSec();
FILE* file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params;
GetParams(params);
int metaInfo[]{
encoder->nlayer, decoder->nlayer,
encoder->fnns->hSize, encoder->selfAtt->d,
encoder->embedder.eSize, encoder->embedder.vSize,
decoder->embedder.vSize, encoder->selfAtt->nhead,
encoder->selfAtt->maxRP, shareAllEmbeddings,
shareDecInputOutputWeight, encoder->embedder.maxLength - 1 - 1, /* restore the configured maxPosLen */
};
/* part 1: hyper-parameters */
fwrite(metaInfo, sizeof(int), sizeof(metaInfo) / sizeof(int), file);
/* part 2: model parameters */
for (int i = 0; i < params.Size(); i++) {
params[i]->BinaryDump(file);
}
fclose(file);
double elapsed = GetClockSec() - startT;
LOG("model saved (took %.1fs)", elapsed);
}
/* read the parameters */
void Model::Read(FILE* file)
{
double startT = GetClockSec();
TensorList params;
GetParams(params);
LOG("params count: %d", params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
}
LOG("params size: %d", size);
/* convert parameters to FP16 before reading files */
if (useFP16) {
LOG("Convert parameters to FP16");
for (int i = 0; i < params.Size(); i++) {
XTensor* p = params[i];
InitTensor(p, p->order, p->dimSize, X_FLOAT16, p->devID, p->enableGrad && X_ENABLE_GRAD);
}
auto& encEmb = encoder->embedder.posEmbeddingBase;
auto& decEmb = decoder->embedder.posEmbeddingBase;
encEmb = ConvertDataType(encEmb, X_FLOAT16);
decEmb = ConvertDataType(decEmb, X_FLOAT16);
}
for (int i = 0; i < params.Size(); i++)
params[i]->BinaryRead(file);
/* share all embeddings */
if (shareAllEmbeddings == 1) {
_CopyValues(&encoder->embedder.w, &decoder->embedder.w);
LOG("sharing encoder decoder embeddings");
}
/* share embeddings with output weights */
if (shareDecInputOutputWeight == 1) {
_CopyValues(&decoder->embedder.w, &outputLayer->w);
LOG("sharing decoder embeddings with output weights");
}
double elapsed = GetClockSec() - startT;
LOG("model loaded (took %.1fs)", elapsed);
}
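/* The model file layout, as produced by Dump and consumed by Read above:
   part 1: 12 int32 hyper-parameters, in the order used in Dump/InitModel
           (nEncLayer, nDecLayer, fnnHiddenSize, modelSize, embSize,
            srcVocabSize, tgtVocabSize, nhead, maxRP, shareAllEmbeddings,
            shareDecInputOutputWeight, maxPosLen);
   part 2: the raw parameter tensors, dumped one after another in the exact
           order returned by GetParams. */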
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __MODEL_H__
#define __MODEL_H__
#include "Encoder.h"
#include "Decoder.h"
#include "layer/FNN.h"
#include "layer/Output.h"
#include "Utility.h"
#include "layer/Attention.h"
namespace nmt
{
/* an NMT model that keeps the parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
{
public:
/* device id */
int devID;
/* the encoder */
AttEncoder* encoder;
/* the decoder */
AttDecoder* decoder;
/* output layer */
Output* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* number of heads in the attention model */
int nhead;
/* indicates whether the encoder and decoder share the embedding matrix */
int shareAllEmbeddings;
/* indicates whether the decoder embeddings are shared with the output weights */
int shareDecInputOutputWeight;
public:
/* constructor */
Model();
/* de-constructor */
~Model();
/* initialize the model */
void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
/* make the decoding network */
XTensor MakeDecoder(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor& maskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrices */
void GetParams(TensorList& list);
/* dump the model to a file */
void Dump(const char* fn);
/* read the parameters */
void Read(FILE* file);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/
#include <ctime>
#include "NMT.h"
#include "train/Trainer.h"
#include "translate/Translator.h"
namespace nmt
{
int NMTMain(int argc, const char** argv)
{
if (argc == 0)
return 1;
/* load configurations */
Config config(argc, argv);
srand(1);
/* training */
if (strcmp(config.trainFN, "") != 0) {
Model model;
model.InitModel(config);
Trainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
/* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD;
Model model;
model.InitModel(config);
Translator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
}
return 0;
}
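/* Example invocations (illustrative; option names follow Config, file names
   are placeholders):
   training:    NiuTrans.NMT -dev 0 -train train.data -valid valid.data -model model.bin
   translating: NiuTrans.NMT -dev 0 -model model.bin -test test.txt -output result.txt
                             -srcvocab vocab.src -tgtvocab vocab.tgt
   options may also be placed in a file and loaded via -config (see Config). */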
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* An implementation of the NMT system.
*/
#ifndef __NMT_H__
#define __NMT_H__
namespace nmt
{
/* entry point of the program */
int NMTMain(int argc, const char** argv);
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <fstream>
#include <sstream>
#include "Utility.h"
#include "../niutensor/tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace nmt
{
/*
load configurations from the command line
>> argc - number of arguments
>> argv - the list of arguments
*/
Config::Config(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
char* configFN = new char[1024];
LoadParamString(argc, args, "config", configFN, "");
int argsNum = argc;
/* load configurations from a file */
if (strcmp(configFN, "") != 0)
argsNum = LoadFromFile(configFN, args);
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
// TODO: refactor the parameter types to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
/* options for training */
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = strcmp(trainFN, "") != 0;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1F);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1F);
LoadParamFloat(argsNum, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argsNum, args, "lrbias", &lrbias, 0.0F);
LoadParamInt(argsNum, args, "nepoch", &nepoch, 50);
LoadParamInt(argsNum, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argsNum, args, "nstep", &nstep, 100000);
LoadParamInt(argsNum, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argsNum, args, "adam", &useAdam, true);
LoadParamFloat(argsNum, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argsNum, args, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argsNum, args, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argsNum, args, "shuffled", &isShuffled, true);
LoadParamFloat(argsNum, args, "labelsmoothing", &labelSmoothingP, 0.1F);
LoadParamInt(argsNum, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argsNum, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argsNum, args, "updatestep", &updateStep, 1);
LoadParamBool(argsNum, args, "sorted", &isLenSorted, false);
LoadParamInt(argsNum, args, "bufsize", &bufSize, 50000);
LoadParamBool(argsNum, args, "doubledend", &isDoubledEnd, false);
LoadParamBool(argsNum, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argsNum, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argsNum, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argsNum, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
LoadParamString(argsNum, args, "output", outputFN, "");
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
delete[] configFN;
}
/*
load configurations from a file
>> configFN - path to the configuration file
>> args - the list to store the configurations
format: one option per line, separated by a blank or a tab
*/
int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file");
int argsNum = 0;
/* parse arguments */
string key, value;
while (f >> key >> value) {
/* prepend '-' so that the key matches the command-line option format */
key = '-' + key;
strcpy(args[argsNum++], key.c_str());
strcpy(args[argsNum++], value.c_str());
}
/* record the number of arguments */
return argsNum;
}
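/* An example configuration file (illustrative): one "key value" pair per
   line, using the same option names as the command line but without the
   leading '-':
       dev 0
       nhead 8
       enclayer 6
       declayer 6
       model model.bin
   LoadFromFile turns each key into "-key" so that the LoadParam* helpers
   below can match it. */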
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
split string by delimiter, this will return indices of all sub-strings
>> s - the original string
>> delimiter - the delimiter string
<< indices - indices of all sub-strings
*/
UInt64List SplitToPos(const string& s, const string& delimiter)
{
UInt64List indices;
if (delimiter.length() == 0) {
/* no delimiter: the whole string is a single sub-string */
indices.Add(0);
return indices;
}
size_t pos = 0;
uint64_t start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
indices.Add(start);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
indices.Add(start);
}
return indices;
}
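/* Usage example (illustrative): SplitToPos("a,b,,c", ",") returns the start
   indices {0, 2, 5} of the non-empty sub-strings "a", "b" and "c"; empty
   fields produced by consecutive delimiters are skipped. */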
/* split a string into an integer list */
IntList SplitInt(const string& s, const string& delimiter)
{
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
/* split a string to a float list */
FloatList SplitFloat(const string& s, const string& delimiter)
{
FloatList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtof(s.data() + indices[i], nullptr));
}
return values;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __UTILITY_H__
#define __UTILITY_H__
#include <string>
#include <cstdio>
#include "../niutensor/tensor/XList.h"
using namespace std;
using namespace nts;
namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations of the model, training and translation */
class Config {
public:
/* path to the model */
char modelFN[1024];
/* path to the source vocab */
char srcVocabFN[1024];
/* path to the target vocab */
char tgtVocabFN[1024];
/* path to the input file (for inference) */
char testFN[1024];
/* path to the output file (for inference) */
char outputFN[1024];
/* path to the training file */
char trainFN[1024];
/* path to the validation file */
char validFN[1024];
/* device id */
int devID;
/* beam size */
int beamSize;
/* word batch size */
int wBatchSize;
/* sentence batch size */
int sBatchSize;
/* number of heads in attention */
int nhead;
/* number of encoder layers */
int nEncLayer;
/* number of decoder layers */
int nDecLayer;
/* the maximum relative position in RPR attentions */
int maxRP;
/* the dimension of embeddings */
int embSize;
/* the dimension of hidden layer */
int modelSize;
/* the maximum length in positional embedding */
int maxPosLen;
/* the dimension of fnn hidden layer */
int fnnHiddenSize;
/* the vocab size of source sequence */
int srcVocabSize;
/* the vocab size of target sequence */
int tgtVocabSize;
/* the padding id */
int padID;
/* start symbol */
int startID;
/* end symbol */
int endID;
/* indicates whether the model uses pre-norm */
bool preNorm;
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* indicates whether we use the RPR attention */
bool useRPR;
/* indicates whether we train the model */
bool isTraining;
/* dropout rate for the model */
float dropout;
/* dropout rate for fnn layers */
float fnnDropout;
/* dropout rate for attention layers */
float attDropout;
/* the alpha parameter controls the length preference */
float lenAlpha;
/* scalar of the input sequence (for max number of search steps) */
float maxLenAlpha;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* training epoch number */
int nepoch;
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */
bool useAdam;
/* hyper parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
/* step number of warm-up for training */
int nwarmup;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
float labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* buffer size */
int bufSize;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* load configurations from the command line */
Config(int argc, const char** argv);
/* load configurations from a file */
int LoadFromFile(const char* configFN, char** args);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "Attention.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Attention::Attention()
{
nhead = -1;
dk = -1;
dv = -1;
d = -1;
}
/* de-constructor */
Attention::~Attention()
{
}
/*
initialize the model
>> config - the configurations of the network
*/
void Attention::InitModel(Config& config)
{
devID = config.devID;
useRPR = config.useRPR;
nhead = config.nhead;
d = config.modelSize;
dk = config.modelSize;
dv = config.modelSize;
maxRP = config.maxRP;
dropoutP = config.attDropout;
/* initialize the parameters */
InitTensor2D(&weightQ, d, d, X_FLOAT, devID);
InitTensor1D(&biasQ, d, X_FLOAT, devID);
InitTensor2D(&weightK, d, d, X_FLOAT, devID);
InitTensor1D(&biasK, d, X_FLOAT, devID);
InitTensor2D(&weightV, d, d, X_FLOAT, devID);
InitTensor1D(&biasV, d, X_FLOAT, devID);
if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&weightO, d, d, X_FLOAT, devID);
InitTensor1D(&biasO, d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&weightK, scale);
_SetDataFanInOut(&weightQ, scale);
_SetDataFanInOut(&weightV, scale);
_SetDataFanInOut(&weightO, scale);
if (useRPR)
_SetDataFanInOut(&RPEmbK, scale);
biasQ.SetZeroAll();
biasO.SetZeroAll();
biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
}
/*
make the network
>> k - keys, B * L * H for encoders, B * 1 * H for decoders
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries, B * L * H
>> v - values, B * L * H for encoders, B * 1 * H for decoders
>> mask - the attention mask added to the attention scores (optional)
>> isTraining - indicates whether the model is used for training
>> cache - decoder cache
>> attType - the attention type, e.g., self-attention or encoder-decoder attention
<< return - the multi-head attention result
*/
XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int attType)
{
const bool isEnc = (cache == NULL);
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MulAndShift(q, weightQ, biasQ);
if (!cache || isTraining || !(cache->enable)) {
/* self attention for encoder layers, or for training where the cache is not used */
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
if (useRPR && attType == SELF_ATT)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
else {
if (attType == SELF_ATT) {
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
/* on a cache hit, we only concatenate the cached keys/values with the new token */
if (!cache->miss) {
k2 = Concatenate(cache->key, k2, 1);
v2 = Concatenate(cache->value, v2, 1);
}
cache->key = k2;
cache->value = v2;
cache->miss = false;
if (useRPR)
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
else if (attType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MulAndShift(k, weightK, biasK);
cache->value = MulAndShift(v, weightV, biasV);
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
CheckNTErrors(0, "invalid cache type");
}
}
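/* A sketch of the cache behavior during incremental decoding (illustrative):
   - self-attention: the new token's key/value are appended to the cached
     ones (Concatenate along the length dimension), so step t attends over
     all previous positions without recomputing them;
   - encoder-decoder attention: the keys/values come from the encoder output,
     are computed once on the first cache miss, and are re-used afterwards. */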
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys, B * L * H
>> q - queries, B * L * H
>> v - values, B * L * H
>> mask - the attention mask (optional)
>> isTraining - indicates whether the model is used for training
*/
XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const auto dataType = k.dataType;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* Some operations may cause numerical overflow under FP16 including
BMMul, Mask, Div and Softmax. So we need to cast the input to FP32 */
if (qheads.dataType == X_FLOAT16) {
qheads = ConvertDataType(qheads, X_FLOAT);
kheads = ConvertDataType(kheads, X_FLOAT);
}
/* att = softmax(Q * K^T / sqrt(dk/nhead)) * V, where dk/nhead is the per-head size */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask)
dot = dot + *mask;
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
if (vheads.dataType != scalar.dataType)
vheads = ConvertDataType(vheads, scalar.dataType);
att = BMMul(scalar, vheads);
if (dataType != att.dataType)
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
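/* Shape walkthrough (illustrative, with batchSize B = 2, length L = 5,
   hidden size H = 512 and nhead = 8, so 64 dimensions per head):
       k, q, v                 : (2, 5, 512)
       Split by head           : (8, 2, 5, 64)
       dot = Q * K^T           : (8, 2, 5, 5), scaled by 1 / sqrt(64)
       scalar = softmax(dot)   : (8, 2, 5, 5)
       att = scalar * V        : (8, 2, 5, 64)
       Merge + output proj     : (2, 5, 512) */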
/*
make the attention network by incorporating the relative position representation
with the given keys, queries and values (after linear transformation)
>> k - keys, B * L * H
>> q - queries, B * L * H
>> v - values, B * L * H
>> mask - the attention mask (optional)
>> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder
*/
XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batchSize = q.GetDim(0);
const int lenQ = q.GetDim(1);
const int lenKV = k.GetDim(1);
const auto dataType = k.dataType;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix);
if (qheads.dataType == X_FLOAT16) {
qheads = ConvertDataType(qheads, X_FLOAT);
kheads = ConvertDataType(kheads, X_FLOAT);
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = (float)sqrt((float)(d / nhead));
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask)
dot = dot + *mask;
/* softmax */
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
if (vheads.dataType != scalar.dataType)
vheads = ConvertDataType(vheads, scalar.dataType);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
if (dataType != att.dataType)
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
/*
generate relative position embeddings
>> lenQ - the length of query
>> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position
*/
XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
{
XTensor range;
XTensor embMatrix;
InitTensor1D(&range, lenKV, X_INT, devID);
int* index = new int[lenKV];
if (isEnc) {
for (int i = 0; i < lenKV; i++)
index[i] = i;
range.SetData(index, lenKV);
XTensor range2D;
XTensor range2DTrans;
range2D = Unsqueeze(range, 0, lenQ);
range2DTrans = Transpose(range2D, 0, 1);
embMatrix = Sum(range2D, range2DTrans, -1);
}
else {
for (int i = 0; i < lenKV; i++)
index[i] = -lenKV + i + 1;
range.SetData(index, lenKV);
embMatrix = Unsqueeze(range, 0, lenQ);
}
embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index;
return embMatrix;
}
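/* Worked example (illustrative): for an encoder with lenQ = lenKV = 3 and
   maxRelativeLen = 2, the relative position j - i is clipped to [-2, 2] and
   shifted by +2, giving the index matrix
       [ 2, 3, 4 ]
       [ 1, 2, 3 ]
       [ 0, 1, 2 ]
   where each entry selects a row of RPEmbK (which has 2 * maxRP + 1 = 5 rows). */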
/*
relative position-aware dot-product attention inner calculation
>> x - queries, (nhead, batchSize, lenQ, dimPerHead)
>> y - keys or values, (nhead, batchSize, lenKV, dimPerHead)
>> z - the relative position embeddings, (lenQ, lenKV, dimPerHead)
>> isKey - whether y is the key tensor
<< return - attention scores (if isKey) or context (otherwise),
(nhead, batchSize, lenQ, lenKV or dimPerHead)
*/
XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{
const int headNum = nhead;
const int batchSize = x.GetDim(1);
const int lenQ = x.GetDim(2);
const int lenKV = y.GetDim(2);
const int depth = y.GetDim(3);
const int lastDim = isKey ? lenKV : depth;
auto transposeFlag = isKey ? X_TRANS : X_NOTRANS;
int mergeDimsX[] = { headNum * batchSize, lenQ, x.GetDim(3) };
int mergeDimsY[] = { headNum * batchSize, lenKV, y.GetDim(3) };
x = Reshape(x, 3, mergeDimsX);
y = Reshape(y, 3, mergeDimsY);
if (isKey) {
y = Transpose(y, 1, 2);
}
XTensor context;
context = BMMul(x, y);
int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
context = Reshape(context, 4, newDims);
XTensor xTrans;
xTrans = Transpose(x, 0, 1);
XTensor relative;
relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
XTensor relativeTrans;
relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans = Reshape(relativeTrans, 4, splitDims);
return context + relativeTrans;
}
/* constructor */
Cache::Cache()
{
miss = true;
enable = true;
}
/* update the states cache */
void Cache::Update(XTensor&& k, XTensor&& v)
{
key = k;
value = v;
miss = false;
}
/* keep alive states */
void Cache::KeepAlive(XTensor& aliveIdx)
{
if (!miss) {
key = AutoGather(key, aliveIdx);
value = AutoGather(value, aliveIdx);
}
}
/* reorder alive states */
void Cache::Reorder(XTensor& reorder)
{
if (!miss) {
key = AutoGather(key, reorder);
value = AutoGather(value, reorder);
}
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __ATTENTION_H__
#define __ATTENTION_H__
#include "NNUtil.h"
#include "../Utility.h"
#include "../../niutensor/network/XNet.h"
#include "../../niutensor/tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
/* cache for keys, (B, L, H) */
XTensor key;
/* cache for values, (B, L, H) */
XTensor value;
public:
/* indicates cache miss if 'true' */
bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */
Cache();
/* update the states cache */
void Update(XTensor&& k, XTensor&& v);
/* keep alive states */
void KeepAlive(XTensor& aliveIdx);
/* reorder alive states */
void Reorder(XTensor& reorder);
};
/* multi-head attention */
class Attention
{
public:
/* device id */
int devID;
/* head number */
int nhead;
/* transformation matrix for Q */
XTensor weightQ;
/* bias for Q */
XTensor biasQ;
/* transformation matrix for K */
XTensor weightK;
/* bias for K */
XTensor biasK;
/* transformation matrix for V */
XTensor weightV;
/* bias for V */
XTensor biasV;
/* fused transformation matrix for Q, K and V (reserved; unused in this file) */
XTensor wBig;
/* fused bias for Q, K and V (reserved; unused in this file) */
XTensor bBig;
/* key embeddings for the relative position representation (RPR) */
XTensor RPEmbK;
/* transformation after dot-product attention */
XTensor weightO;
/* bias after dot-product attention */
XTensor biasO;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether we use the RPR attention */
bool useRPR;
/* dropout probability */
DTYPE dropoutP;
/* the maximum relative window size */
int maxRP;
public:
/* constructor */
Attention();
/* de-constructor */
~Attention();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining,
Cache* cache, int attType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining);
/* make the attention network with relative position representations (after linear transformation) */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
* This file includes some common modules of the Transformer model
*/
#include "CommonModules.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
namespace nmt
{
/*
flexible layer normalization for the Transformer
>> input - input tensor
>> ln - the layernorm network
>> prenorm - whether we use prenorm or not
>> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn
*/
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{
if (after ^ prenorm)
return ln.Make(input);
else
return input;
}
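/* Truth table of (after ^ prenorm) as used above (illustrative):
       prenorm = true,  after = false (a "before" call) -> apply ln
       prenorm = true,  after = true  (an "after" call)  -> identity
       prenorm = false, after = false (a "before" call) -> identity
       prenorm = false, after = true  (an "after" call)  -> apply ln
   i.e. pre-norm models normalize before the sub-layer and post-norm models
   after it; the "before" flag is implied by the callers as !after. */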
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "LayerNorm.h"
#include "CommonModules.h"
using namespace nts;
namespace nmt
{
/* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Embedder::Embedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* de-constructor */
Embedder::~Embedder()
{
}
/*
initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void Embedder::InitModel(Config& config, bool isEnc)
{
devID = config.devID;
d = config.modelSize;
padIdx = config.padID;
eSize = config.embSize;
maxLength = config.maxPosLen;
vSize = (isEnc) ? config.srcVocabSize : config.tgtVocabSize;
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
/* reserve two extra rows: the position indices are shifted by padIdx + 1 in Make */
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F / (float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void Embedder::MakePosEmbedding(int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float* data = new float[posEmbeddingBase.unitNum];
for (int pos = 0; pos < length; pos++) {
float* dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for (int i = 0; i < channelSize; i++) {
dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
for (int i = 0; i < channelSize; i++) {
dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
}
/* set the embedding of the padding position to zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
if (w.dataType != posEmbeddingBase.dataType)
posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);
delete[] data;
}
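/* The loop above implements the sinusoidal positional encoding:
       PE(pos, i)               = sin(pos / 10000^(i / (channelSize - 1)))
       PE(pos, channelSize + i) = cos(pos / 10000^(i / (channelSize - 1)))
   with channelSize = eSize / 2, i.e. all sine channels come first, followed
   by all cosine channels (rather than interleaving them). */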
/*
make the network
>> input - the word indices
>> nstep - the length of current sequence
>> isDec - indicates whether it is decoder
>> isTraining - indicates whether it is training
<< return - word & position embeddings of the input
*/
XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* sanity checks on the input; the padding index is assumed to be 1 by default */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
if (!isDec || isTraining || input.GetDim(-1) > 1)
{
position.Range(0, position.unitNum, 1);
/* offset the positions by padIdx + 1; these index operations need no gradient */
ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
}
else
{
/* decoder embeddings during decoding */
position.SetDataFixed(nstep + padIdx + 1);
}
/* we make positional embeddings first */
XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
}
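/* Shape walkthrough (illustrative): for input word indices of shape (B, L),
       wordEmbedding = Gather(w, input)             : (B, L, eSize)
       posEmbedding  = Unsqueeze(Gather(...), 0, B) : (B, L, eSize)
   the word embeddings are scaled by sqrt(eSize) before the two are summed,
   the usual Transformer convention for balancing the two terms. */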
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../Utility.h"
#include "../../niutensor/network/XNet.h"
using namespace nts;
namespace nmt
{
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class Embedder
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the model */
int d;
/* padding index */
int padIdx;
/* word embedding matrix */
XTensor w;
/* pre-computed positional embeddings. Caching them speeds up
   the embedding step since they need not be re-computed for every batch */
XTensor posEmbeddingBase;
public:
/* constructor */
Embedder();
/* de-constructor */
~Embedder();
/* initialize the model */
void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int length);
/* make the network */
XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "FNN.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
namespace nmt
{
/* constructor */
FNN::FNN()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* de-constructor */
FNN::~FNN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void FNN::InitModel(Config& config)
{
devID = config.devID;
inSize = config.modelSize;
outSize = config.modelSize;
hSize = config.fnnHiddenSize;
dropoutP = config.fnnDropout;
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1D(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
/* Xavier-uniform initialization: U(-sqrt(6/fanIn), +sqrt(6/fanIn)) */
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
}
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor FNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MulAndShift(input, w1, b1));
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
return MulAndShift(t1, w2, b2);
}
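/* Usage sketch (illustrative; `config` and `x` are assumed to exist):

   FNN ffn;
   ffn.InitModel(config);
   XTensor y = ffn.Make(x, true);

   For an input of shape (batch, length, inSize), the hidden layer has size
   hSize and the result has shape (batch, length, outSize); dropout is only
   applied because isTraining is true. */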
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __FNN_H__
#define __FNN_H__
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../niutensor/tensor/XTensor.h"
using namespace nts;
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class FNN
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
FNN();
/* de-constructor */
~FNN();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input, bool isTraining);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include "GLU.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
namespace nmt
{
/* constructor */
GLU::GLU()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* de-constructor */
GLU::~GLU()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(Config& config)
{
devID = config.devID;
inSize = config.modelSize;
outSize = config.modelSize;
/* the input is split into two halves of dimension hSize (see Make);
   hSize was never set in the original code, so we assume it equals
   the model size here */
hSize = config.modelSize;
InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b1, outSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
}
/*
make the network
y = (W1 * x1 + b1) * sigmoid(W2 * x2 + b2), where x = [x1; x2]
>> input - the input tensor, whose last dimension is 2 * hSize
>> return - the output tensor, whose last dimension is outSize
*/
XTensor GLU::Make(XTensor& input)
{
XTensor t1;
XTensor t2;
TensorList input_list;
/* split the input into two vectors with the dim hSize */
Split(input, input_list, -1, 2);
/* t1 = W1 * x + b1 */
t1 = MulAndShift(input_list.GetItem(0), w1, b1);
/* t2 = W2 * x + b2 */
t2 = MulAndShift(input_list.GetItem(1), w2, b2);
return t1 * Sigmoid(t2);
}
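/* Gating sketch (illustrative): the input x = [x1; x2] is split into two
   halves of dimension hSize; the first half is transformed linearly while
   sigmoid(W2 * x2 + b2), which lies in (0, 1), acts as a soft gate that
   scales each output channel element-wise. */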
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __GLU_H__
#define __GLU_H__
#include "LayerNorm.h"
using namespace nts;
namespace nmt
{
/* a gated linear unit (GLU): y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2) */
class GLU
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
public:
/* constructor */
GLU();
/* de-constructor */
~GLU();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include "Embedding.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace nmt
{
/* constructor */
LayerHistory::LayerHistory()
{
d = -1;
/* start at 0 so that layerNorms[count - 2] is valid from the second Add() on */
count = 0;
weight = NULL;
layerNorms = NULL;
}
/* de-constructor */
LayerHistory::~LayerHistory()
{
history.Clear();
delete[] layerNorms;
}
/*
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
nlayer = config.nEncLayer;
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) {
layerNorms[i].InitModel(config);
}
}
/*
the Add operation
>> tensor - the previous layer output. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
*/
void LayerHistory::Add(XTensor& tensor)
{
/* the embedding is not normed */
count += 1;
if (history.Size() == 0) {
history.Add(&tensor);
return;
}
XTensor ln = layerNorms[count - 2].Make(tensor);
history.Add(&ln);
}
/*
generate the weighted sum of all previous layer outputs in the history as the input of the next layer
*/
XTensor LayerHistory::Pop()
{
/* the number of layer output in the history */
size_t size = history.Size();
TensorList historyList;
for (size_t i = 0; i < size; i++)
historyList.Add(history[i]);
/* stack the tensors along the first dimension */
XTensor stackTensor = Stack(historyList, 0);
XTensor interWeight;
InitTensor2D(&interWeight, 1, weight.dimSize[1], DEFAULT_DTYPE, devID);
XTensor layerWeight;
InitTensor1D(&layerWeight, size, DEFAULT_DTYPE, devID);
_SelectRange(&weight, &interWeight, 0, size - 1, size);
interWeight.Reshape(interWeight.unitNum);
_SelectRange(&interWeight, &layerWeight, 0, 0, size);
MultiplyDimMe(stackTensor, layerWeight, 0);
XTensor result;
ReduceSum(stackTensor, result, 0);
return result;
}
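/* Usage sketch (illustrative; `config` and the per-layer tensors are assumed):

   LayerHistory history;
   history.InitModel(config);
   history.ClearHistory();
   history.Add(embedding);          // layer 0: the (un-normed) embedding
   XTensor input1 = history.Pop();  // weighted sum used as layer-1 input
   history.Add(layer1Output);

   This realizes the dense connections of dlcl: each layer reads a learned
   weighted combination of all preceding layer outputs. */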
void LayerHistory::ClearHistory()
{
history.Clear();
/* reset the layer counter so that Add() starts from the embedding again */
count = 0;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../../niutensor/tensor/function/FHeader.h"
using namespace nts;
namespace nmt
{
/*
layer history for densely connected layers (dlcl):
the input of layer i is a learned weighted sum of the
(normalized) outputs of all layers 0 ... i-1
*/
class LayerHistory
{
public:
/* device id */
int devID;
/* the triangle weight matrix for dlcl */
XTensor weight;
/* hidden size */
int d;
/* layer number */
int nlayer;
/* current layer number */
int count;
/* a history to store the outputs of intermediate layers */
TensorList history;
/* layer normalization for each intermediate layer */
LN* layerNorms;
public:
/* constructor */
LayerHistory();
/* de-constructor */
~LayerHistory();
/* initialize the model */
void InitModel(Config& config);
/* add the layer output to the history */
void Add(XTensor& tensor);
/* compute the layer input for the current layer, the weight sum of all previous layer output after normed in the history */
XTensor Pop();
/* clear the history */
void ClearHistory();
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Embedding.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
LN::LN()
{
devID = -1;
d = 0;
}
/* de-constructor */
LN::~LN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void LN::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
/* initialize the scale to 1 and the bias to 0 */
w.SetDataFixed(1);
b.SetZeroAll();
}
/*
make the network
>> input - the input tensor
>> return - layer normalization output
*/
XTensor LN::Make(XTensor& input)
{
XTensor& x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
XTensor meanFilled;
XTensor standardFilled;
TENSOR_DATA_TYPE dataType = input.dataType;
if (dataType == X_FLOAT16) {
/* reduce functions can only run with FP32 */
x = ConvertDataType(input, X_FLOAT);
}
/* \mu = (sum_i x_i)/m */
mean = ReduceMean(x, x.order - 1);
/* \sigma^2 = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
if (dataType != mean.dataType) {
x = ConvertDataType(x, dataType);
xn = ConvertDataType(xn, dataType);
}
/* result = x' * w + b */
return xn * w + b;
}
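/* Worked example (illustrative): for a row x = [1, 2, 3], mean = 2 and,
   since ReduceVariance divides by m, variance = 2/3 and standard ~ 0.8165,
   so xn ~ [-1.22, 0, 1.22]; with w = 1 and b = 0 this is also the output. */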
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __LAYERNORMAL_H__
#define __LAYERNORMAL_H__
#include "../Utility.h"
#include "../../niutensor/network//XNet.h"
using namespace nts;
namespace nmt
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class LN
{
public:
/* device id */
int devID;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
LN();
/* de-constructor */
~LN();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "NNUtil.h"
namespace nmt
{
/*
a wrapper for the gather function
>> src - the input tensor
>> index - the index tensor
<< res - the output tensor
*/
XTensor AutoGather(XTensor& src, XTensor& index)
{
if (src.order == 2)
return Gather(src, index);
else {
CheckNTErrors(src.order == 3, "the source must be 3d");
int order = src.order;
int dimSize[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < src.order; i++) {
dimSize[i] = src.dimSize[i];
}
src.Reshape(src.dimSize[0], src.dimSize[1] * src.dimSize[2]);
XTensor res = Gather(src, index);
src.Reshape(order, dimSize);
dimSize[0] = index.dimSize[0];
dimSize[1] = res.unitNum / (dimSize[0] * dimSize[2]);
res.Reshape(order, dimSize);
return res;
}
}
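/* Shape sketch (illustrative): for a 3-d source of shape (B, L, H) and a
   1-d index of length K, the source is first viewed as (B, L * H); Gather
   then selects K rows, and the result is reshaped back to (K, L, H), i.e.
   AutoGather gathers along the first dimension. */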
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __NNUTIL_H__
#define __NNUTIL_H__
#include "../../niutensor/tensor/XGlobal.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
using namespace nts;
namespace nmt
{
/* the gather function for tensor with any dimension */
XTensor AutoGather(XTensor& src, XTensor& index);
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Output.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Output::Output()
{
devID = -1;
vSize = -1;
hSize = -1;
}
/* de-constructor */
Output::~Output()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void Output::InitModel(Config& config)
{
devID = config.devID;
hSize = config.modelSize;
vSize = config.tgtVocabSize;
InitTensor2D(&w, vSize, hSize, X_FLOAT, devID);
DTYPE v = 1.0F / (float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
>> isTraining - whether it is used for training
>> normalized - whether to normalize the output with log-softmax (for beam search)
*/
void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
XTensor& x = input;
output = MMul(x, X_NOTRANS, w, X_TRANS);
/* use softmax for training */
if (isTraining) {
output = Softmax(output, -1);
return;
}
/* normalize the output for beam search */
if (normalized) {
auto dataType = output.dataType;
if (dataType == X_FLOAT16)
output = ConvertDataType(output, X_FLOAT);
output = LogSoftmax(output, -1);
if (output.dataType != dataType)
output = ConvertDataType(output, dataType);
}
}
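/* Usage sketch (illustrative; `config` and `hidden` are assumed to exist):

   Output out;
   out.InitModel(config);
   XTensor prob;
   out.Make(hidden, prob, false, true);

   Since w is shaped (vSize, hSize), the projection uses a transposed
   multiplication (X_TRANS); with normalized = true the result holds
   log-probabilities suitable for beam search. */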
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __OUTPUT_H__
#define __OUTPUT_H__
#include "../Utility.h"
#include "../../niutensor/tensor/function/FHeader.h"
using namespace nts;
namespace nmt
{
/* output layer */
class Output
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* vector size of the linear transformation */
int hSize;
/* transformation matrix */
XTensor w;
public:
/* constructor */
Output();
/* de-constructor */
~Output();
/* initialize the model */
void InitModel(Config& config);
/* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-08-09
* TODO: refactor the data loader class and references
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "TrainDataSet.h"
#include "../Utility.h"
#include "../translate/Vocab.h"
using namespace nmt;
namespace nts {
/* sort the dataset by length (in descending order) */
void TrainDataSet::SortByLength() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return (a->srcSent.Size() + a->tgtSent.Size())
> (b->srcSent.Size() + b->tgtSent.Size());
});
}
/* sort buckets by key (in descending order) */
void TrainDataSet::SortBucket() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return a->bucketKey > b->bucketKey;
});
}
/*
sort the items in a range by key (in descending order)
>> begin - the first index of the range
>> end - the last index of the range
*/
void TrainDataSet::SortInBucket(int begin, int end) {
sort(buffer.items + begin, buffer.items + end,
[](TrainExample* a, TrainExample* b) {
return (a->key > b->key);
});
}
/*
load all data from a file to the buffer
training data format (binary):
first 4 bytes: source vocabulary size
next 4 bytes: target vocabulary size
next 8 bytes: number of sentence pairs
subsequent segments:
source sentence length (4 bytes)
target sentence length (4 bytes)
source tokens (4 bytes per token)
target tokens (4 bytes per token)
*/
void TrainDataSet::LoadDataToBuffer()
{
buffer.Clear();
curIdx = 0;
int id = 0;
uint64_t sentNum = 0;
int srcVocabSize = 0;
int tgtVocabSize = 0;
fread(&srcVocabSize, sizeof(srcVocabSize), 1, fp);
fread(&tgtVocabSize, sizeof(tgtVocabSize), 1, fp);
fread(&sentNum, sizeof(uint64_t), 1, fp);
CheckNTErrors(sentNum > 0, "Invalid sentence pairs number");
while (id < sentNum) {
int srcLen = 0;
int tgtLen = 0;
fread(&srcLen, sizeof(int), 1, fp);
fread(&tgtLen, sizeof(int), 1, fp);
CheckNTErrors(srcLen > 0, "Invalid source sentence length");
CheckNTErrors(tgtLen > 0, "Invalid target sentence length");
IntList srcSent;
IntList tgtSent;
srcSent.ReadFromFile(fp, srcLen);
tgtSent.ReadFromFile(fp, tgtLen);
TrainExample* example = new TrainExample;
example->id = id++;
example->key = id;
example->srcSent = srcSent;
example->tgtSent = tgtSent;
buffer.Add(example);
}
fclose(fp);
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device (for training)
>> batchEnc - a tensor to store the batch of encoder input
>> paddingEnc - a tensor to store the batch of encoder paddings
>> batchDec - a tensor to store the batch of decoder input
>> paddingDec - a tensor to store the batch of decoder paddings
>> label - a tensor to store the label of input
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - number of target tokens and sentences
*/
UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID)
{
UInt64List info;
size_t srcTokenNum = 0;
size_t tgtTokenNum = 0;
int realBatchSize = 1;
if (!isTraining)
realBatchSize = minSentBatch;
/* get the maximum source sentence length in a mini-batch */
size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
/* max batch size */
const int MAX_BATCH_SIZE = 512;
/* dynamic batching for sentences, enabled when the dataset is used for training */
if (isTraining) {
while ((realBatchSize < (buffer.Size() - curIdx))
&& (realBatchSize * maxSrcLen < batchSize)
&& (realBatchSize < MAX_BATCH_SIZE)
&& (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
realBatchSize++;
}
}
/* real batch size */
if ((buffer.Size() - curIdx) < realBatchSize) {
realBatchSize = buffer.Size() - curIdx;
}
CheckNTErrors(realBatchSize > 0, "Invalid batch size");
/* get the maximum target sentence length in a mini-batch */
size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
for (size_t i = 0; i < realBatchSize; i++) {
if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
}
for (size_t i = 0; i < realBatchSize; i++) {
if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
maxSrcLen = buffer[curIdx + i]->srcSent.Size();
}
CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
int* batchEncValues = new int[realBatchSize * maxSrcLen];
float* paddingEncValues = new float[realBatchSize * maxSrcLen];
int* labelValues = new int[realBatchSize * maxTgtLen];
int* batchDecValues = new int[realBatchSize * maxTgtLen];
float* paddingDecValues = new float[realBatchSize * maxTgtLen];
for (int i = 0; i < realBatchSize * maxSrcLen; i++) {
batchEncValues[i] = PAD;
paddingEncValues[i] = 1;
}
for (int i = 0; i < realBatchSize * maxTgtLen; i++) {
batchDecValues[i] = PAD;
labelValues[i] = PAD;
paddingDecValues[i] = 1.0F;
}
size_t curSrc = 0;
size_t curTgt = 0;
/*
batchEnc: end with EOS (left padding)
batchDec: begin with SOS (right padding)
label: end with EOS (right padding)
*/
for (int i = 0; i < realBatchSize; ++i) {
srcTokenNum += buffer[curIdx + i]->srcSent.Size();
tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
curSrc = maxSrcLen * i;
for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
}
curTgt = maxTgtLen * i;
for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
if (j > 0)
labelValues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
}
labelValues[curTgt - 1] = EOS;
while (curSrc < maxSrcLen * (i + 1))
paddingEncValues[curSrc++] = 0;
while (curTgt < maxTgtLen * (i + 1))
paddingDecValues[curTgt++] = 0;
}
InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
curIdx += realBatchSize;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetData(paddingEncValues, paddingEnc->unitNum);
batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetData(paddingDecValues, paddingDec->unitNum);
label->SetData(labelValues, label->unitNum);
delete[] batchEncValues;
delete[] paddingEncValues;
delete[] batchDecValues;
delete[] paddingDecValues;
delete[] labelValues;
info.Add(tgtTokenNum);
info.Add(realBatchSize);
return info;
}
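/* Layout sketch (illustrative): assuming a stored target sentence
   [SOS, a, b, EOS] and maxTgtLen = 6, the loop above produces

   batchDec: SOS a b   EOS PAD PAD
   label:    a   b EOS EOS PAD PAD

   i.e. the label is the decoder input shifted left by one position, and
   paddingDec is 1 over real tokens and 0 over the padded tail. */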
/*
initialize the dataset for training
>> dataFile - path of the data file
>> bucketSize - size of the bucket to keep similar length sentence pairs
>> training - indicates whether it is used for training
*/
void TrainDataSet::Init(const char* dataFile, int myBucketSize, bool training)
{
fp = fopen(dataFile, "rb");
CheckNTErrors(fp, "can not open the training file");
curIdx = 0;
bucketSize = myBucketSize;
isTraining = training;
LoadDataToBuffer();
SortByLength();
if (isTraining)
BuildBucket();
}
/* check if the buffer is empty */
bool TrainDataSet::IsEmpty() {
if (curIdx < buffer.Size())
return false;
return true;
}
/* reset the buffer */
void TrainDataSet::ClearBuf()
{
curIdx = 0;
/* make different batches in different epochs */
SortByLength();
if (isTraining)
BuildBucket();
}
/* group data into buckets with similar length */
void TrainDataSet::BuildBucket()
{
size_t idx = 0;
/* build and shuffle buckets */
while (idx < buffer.Size()) {
/* sentence number in a bucket */
size_t sentNum = 1;
/* get the maximum source sentence length in a bucket */
size_t maxSrcLen = buffer[idx]->srcSent.Size();
/* bucketing for sentences */
while ((sentNum < (buffer.Size() - idx))
&& (sentNum * maxSrcLen < bucketSize)
&& (sentNum * buffer[curIdx + sentNum]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
/* make sure the number is valid */
if ((buffer.Size() - idx) < sentNum) {
sentNum = buffer.Size() - idx;
}
int randomKey = rand();
/* shuffle items in a bucket */
for (size_t i = 0; i < sentNum; i++) {
buffer[idx + i]->bucketKey = randomKey;
}
idx += sentNum;
}
SortBucket();
/* sort items in a bucket */
idx = 0;
while (idx < buffer.Size()) {
size_t sentNum = 0;
int bucketKey = buffer[idx + sentNum]->bucketKey;
while (sentNum < (buffer.Size() - idx)
&& buffer[idx + sentNum]->bucketKey == bucketKey) {
buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
SortInBucket(idx, idx + sentNum);
idx += sentNum;
}
}
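/* Worked example (illustrative): with bucketSize = 8 and sorted source
   lengths [5, 5, 4, 3, 2], bucketing stops once
   sentNum * maxSrcLen >= bucketSize, giving buckets {5, 5}, {4, 3} and {2};
   each bucket then receives one random key so that whole buckets are
   shuffled between epochs while their contents stay length-sorted. */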
/* de-constructor */
TrainDataSet::~TrainDataSet()
{
/* release the buffer */
for (int i = 0; i < buffer.Size(); i++)
delete buffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../niutensor/tensor/XList.h"
#include "../../niutensor/tensor/XTensor.h"
#include "../../niutensor/tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a class of sentence pairs for training */
struct TrainExample {
/* id of the sentence pair */
int id;
/* source language sentence (tokenized) */
IntList srcSent;
/* target language sentence (tokenized) */
IntList tgtSent;
/* the key used to shuffle items in a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
public:
/* the data buffer */
TrainBufferType buffer;
/* a list of empty line number */
IntList emptyLines;
/* the pointer to file stream */
FILE* fp;
/* current index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences */
size_t bucketSize;
/* indicates whether it is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort the output by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is empty */
bool IsEmpty();
/* reset the buffer */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#include "Trainer.h"
#include "../Utility.h"
#include "../../niutensor/network/XNoder.h"
#include "../../niutensor/tensor/XUtility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/loss/LHeader.h"
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#endif
#include "../../niutensor/tensor/XMem.h"
namespace nmt
{
/* constructor */
Trainer::Trainer()
{
cfg = NULL;
}
/* de-constructor */
Trainer::~Trainer()
{
for (int i = 0; i < moments.count; i++) {
XTensor* m = (XTensor*)moments.Get(i);
delete m;
}
for (int i = 0; i < moments2nd.count; i++) {
XTensor* m = (XTensor*)moments2nd.Get(i);
delete m;
}
}
/*
initialization
>> config - configurations of the training process
*/
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
wBatchSize = config.wBatchSize;
bucketSize = config.bucketSize;
nepoch = config.nepoch;
nstep = config.nstep;
maxCheckpoint = config.maxCheckpoint;
d = config.modelSize;
nwarmup = config.nwarmup;
vSize = config.srcVocabSize;
vSizeTgt = config.tgtVocabSize;
useAdam = config.useAdam;
adamBeta1 = config.adamBeta1;
adamBeta2 = config.adamBeta2;
adamDelta = config.adamDelta;
isShuffled = config.isShuffled;
labelSmoothingP = config.labelSmoothingP;
nStepCheckpoint = config.nStepCheckpoint;
useEpochCheckpoint = config.useEpochCheckpoint;
updateStep = config.updateStep;
isLenSorted = config.isLenSorted;
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
/*
train the model
>> fn - training data file
>> validFN - validation data file
>> modelFN - where we keep the model
>> model - model to train
*/
void Trainer::Train(const char* fn, const char* validFN,
const char* modelFN, Model* model)
{
/* disable cache during training */
for (int i = 0; i < model->decoder->nlayer; i++) {
model->decoder->selfAttCache[i].enable = false;
model->decoder->enDeAttCache[i].enable = false;
}
int step = 0;
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int batchCountTotal = 0;
bool isEnd = false;
float loss = 0;
float lr = 0;
int nStepCheck = 0;
int nCheckpoint = 0;
int nSkipped = 0;
int gradStep = 0;
int validStep = 0;
int epoch = 0;
char* trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn);
#ifndef WIN32
if (isShuffled)
sprintf(trainFN, "%s.random", fn);
#endif
int devID = model->devID;
PrepareModel(model);
double startT = GetClockSec();
batchLoader.Init(fn, bucketSize, true);
for (epoch = 1; epoch <= nepoch; epoch++) {
wordCount = 0;
loss = 0;
/* reset the batch loader */
batchLoader.ClearBuf();
while (!batchLoader.IsEmpty())
{
XNet net;
net.Clear();
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* labels */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, wBatchSize, devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true);
else if (model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, true);
else {
ShowNTErrors("Illegal model type!");
}
/* get loss and probabilities */
XTensor labelOnehot;
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor);
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
if (doUpdate) {
/* back-propagation */
net.Backward(lossTensor);
gradStep += 1;
loss += lossBatch;
wordCount += wc;
wordCountTotal += wc;
batchCountTotal += ws;
/* update the parameters */
if (gradStep == updateStep) {
float warmupEndLR = lrate;
float warmupInitLR = 1e-7;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
/* learning rate, scheduled by inverse square root */
if (step < nwarmup)
lr = warmupInitLR + step * lrStep;
else
lr = decayFactor * pow((float)step, -0.5F);
/* model update */
Update(model, lr);
gradStep = 0;
validStep++;
}
}
else
nSkipped++;
if (++step >= nstep) {
isEnd = true;
break;
}
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
LOG("elapsed=%.1fs, step=%d, epoch=%d, "
"total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, lr=%.2e",
elapsed, step, epoch, wordCountTotal, batchCountTotal,
loss / wordCount / log(2.0), exp(loss / wordCount), lr);
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
}
if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
MakeCheckpoint(model, validFN, modelFN, "step", step);
nStepCheck = 0;
nCheckpoint++;
}
}
if (isEnd)
break;
if (useEpochCheckpoint)
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch);
LOG("lr=%.2e, elapsed=%.1fs, step=%d, "
"epoch=%d, word=%d, loss=%.3f, ppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss / wordCount / log(2.0), exp(loss / wordCount));
LOG("training finished (took %.1fs, step=%d, "
"skipped=%d and epoch=%d)", elapsed, step, nSkipped, epoch);
LOG("saving the final model");
model->Dump(modelFN);
delete[] trainFN;
}
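/* Schedule sketch (illustrative values): with lrate = 7e-4 and nwarmup = 4000,
   the learning rate rises linearly from ~1e-7 to 7e-4 during the first 4000
   steps, then decays as lrate * sqrt(nwarmup / step); at step 16000 this
   gives 7e-4 * sqrt(4000 / 16000) = 3.5e-4. */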
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void Trainer::Validate(const char* fn, const char* ofn, Model* model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int sentCount = 0;
float loss = 0;
/* data files */
batchLoader.Init(fn, 0, false);
double startT = GetClockSec();
while (!batchLoader.IsEmpty())
{
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* output probabilities */
XTensor output;
/* prediction probabilities */
XTensor labelOnehot;
XTensor lossTensor;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, 0, model->devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
/* make the network */
if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false);
else if (model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, false);
else {
ShowNTErrors("Illegal model type!");
}
int bSize = output.GetDim(0);
int length = output.GetDim(1);
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor);
loss += lossBatch;
wordCount += wc;
sentCount += bSize;
}
double elapsed = GetClockSec() - startT;
LOG("test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)",
elapsed, sentCount, wordCount, loss / wordCount / log(2.0), exp(loss / wordCount));
}
/*
make a checkpoint
>> model - the model
>> validFN - validation data file
>> modelFN - model data file
>> label - label of the model
>> id - id of the checkpoint
*/
void Trainer::MakeCheckpoint(Model* model, const char* validFN,
const char* modelFN, const char* label, int id)
{
LOG("make a checkpoint");
char* fn = new char[MAX_LINE_LENGTH];
Trainer validator;
validator.Init(*cfg);
/* rotate the checkpoint id so that only the last maxCheckpoint models are kept */
id = validator.maxCheckpoint - (maxCheckpoint--);
if (maxCheckpoint == 0)
maxCheckpoint = validator.maxCheckpoint;
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
model->Dump(fn);
delete[] fn;
char* fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
if (validFN != NULL) {
validator.Validate(validFN, fn2, model);
}
delete[] fn2;
}
/*
update the model by delta rule
\theta_{new} = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
>> model - the model
>> lr - learning rate
*/
void Trainer::Update(Model* model, const float lr)
{
TensorList ws;
model->GetParams(ws);
for (int i = 0; i < ws.Size(); i++) {
XTensor* para = ws[i];
XTensor* paraGrad = para->grad;
if (paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
if (useAdam) {
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
/* m = beta_1 * m + (1-beta_1) * grad */
XTensor* m = (XTensor*)moments.Get(i);
_ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, paraGrad, m, (1.0F - adamBeta1));
/* v = beta_2 * v + (1-beta_2) * grad * grad*/
XTensor* v = (XTensor*)moments2nd.Get(i);
_Multiply(paraGrad, paraGrad, v, adamBeta2 / (1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
/* the delta rule */
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
}
else {
/* the delta rule */
_Sum(para, paraGrad, para, -lr);
}
/* clear gradient */
paraGrad->SetZeroAll();
}
}
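/* Update sketch: the Adam branch above implements

   m  = beta1 * m + (1 - beta1) * g
   v  = beta2 * v + (1 - beta2) * g^2
   e  = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
   w -= e * m / (sqrt(v) + delta')

   where delta' = adamDelta * sqrt(1 - beta2^t); the bias-correction terms
   beta1^t and beta2^t are tracked in adamBeta1T and adamBeta2T. */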
/*
prepare model for training
>> model - the model for training
*/
void Trainer::PrepareModel(Model* model)
{
moments.Clear();
moments2nd.Clear();
TensorList ws;
model->GetParams(ws);
for (int i = 0; i < ws.Size(); i++) {
XTensor* para = ws[i];
XNoder::MakeGrad(para);
if (useAdam) {
XTensor* m = new XTensor(para);
XTensor* m2 = new XTensor(para);
m->SetZeroAll();
m2->SetZeroAll();
moments.Add(m);
moments2nd.Add(m2);
}
}
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __TRAINER_H__
#define __TRAINER_H__
#include "../Model.h"
#include "TrainDataSet.h"
using namespace nts;
namespace nmt
{
/* trainer of the model */
class Trainer
{
public:
/* configurations */
Config* cfg;
/* dimension size of each inner layer */
int d;
/* step number of warm-up for training */
int nwarmup;
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
/* word batch size */
int wBatchSize;
/* size of bucket for grouping data by length */
int bucketSize;
/* training epoch number */
int nepoch;
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use adam */
bool useAdam;
/* hyper-parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moment of the parameter matrices */
TensorList moments;
/* list of the 2nd order moment of the parameter matrices */
TensorList moments2nd;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
DTYPE labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* used for loading batches */
TrainDataSet batchLoader;
public:
/* constructor */
Trainer();
/* de-constructor */
~Trainer();
/* initialize the trainer */
void Init(Config& config);
/* train the model */
void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
/* test the model */
void Validate(const char* fn, const char* ofn, Model* model);
/* make a checkpoint */
void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
/* update the model by delta rule */
void Update(Model* model, const float lr);
/* prepare model for training */
void PrepareModel(Model* model);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "DataSet.h"
#include "../Utility.h"
using namespace nmt;
namespace nts {
/* sort the input by length (in descending order) */
void DataSet::SortInput() {
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count,
[](Example* a, Example* b) {
return a->values.count > b->values.count;
});
}
/* sort the output by id (in ascending order) */
void DataSet::SortOutput() {
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count,
[](Result* a, Result* b) {
return a->id < b->id;
});
}
/*
load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
inputBuffer.Clear();
bufferUsed = 0;
int id = 0;
const string tokenDelimiter = " ";
while (getline(*fp, line)) {
IntList values;
/* load words and transform them to ids */
auto indices = SplitToPos(line, tokenDelimiter);
/* keep only the first MAX_WORD_NUM (120) words if the input is too long */
size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
for (size_t i = 0; i < maxLen; i++) {
auto offset = (i != (indices.Size() - 1)) ?
indices[i + 1] - indices[i] - tokenDelimiter.size()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(UNK);
else
values.Add(srcVocab.word2id.at(word));
}
/* make sure that the sequence ends with EOS */
if (values.Size() != 0 && values[-1] != EOS)
values.Add(EOS);
Example* example = new Example;
example->id = id;
example->values = values;
if (values.Size() != 0)
inputBuffer.Add(example);
else
emptyLines.Add(id);
id++;
}
fp->close();
SortInput();
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device (for translating)
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< indices of the sentences
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t minSentBatch, size_t batchSize, int devID)
{
size_t realBatchSize = minSentBatch;
/* get the maximum sentence length in a mini-batch */
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
/* dynamic batching is disabled here, so a batch
   always holds minSentBatch sentences */
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
realBatchSize = inputBuffer.Size() - bufferUsed;
}
CheckNTErrors(maxLen != 0, "invalid length");
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; i++) {
batchValues[i] = PAD;
paddingValues[i] = 1.0F;
}
size_t curSrc = 0;
/* right padding */
UInt64List infos;
size_t totalLength = 0;
for (int i = 0; i < realBatchSize; ++i) {
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
curSrc = maxLen * i;
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
while (curSrc < maxLen * (i + 1))
paddingValues[curSrc++] = 0;
}
infos.Add(totalLength);
InitTensor2D(batchEnc, realBatchSize, maxLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);
bufferUsed += realBatchSize;
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
return infos;
}
/*
initialize the dataset for translation
>> dataFile - path of the data file
>> srcVocabFN - path of the source vocab file
>> tgtVocabFN - path of the target vocab file
*/
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
fp = new ifstream(dataFile);
CheckNTErrors(fp->is_open(), "Can not open the test data");
bufferUsed = 0;
CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
CheckNTErrors(strcmp(tgtVocabFN, "") != 0, "missing target vocab file");
srcVocab.Load(srcVocabFN);
/* share source and target vocabs */
if (strcmp(srcVocabFN, tgtVocabFN) == 0) {
XPRINT(0, stderr, "[INFO] share source and target vocabs \n");
tgtVocab.CopyFrom(srcVocab);
}
else {
tgtVocab.Load(tgtVocabFN);
}
LoadDataToBuffer();
}
/* check if the buffer is empty */
bool DataSet::IsEmpty() {
if (bufferUsed < inputBuffer.Size())
return false;
return true;
}
/* dump the translation to a file */
void DataSet::DumpRes(const char* ofn)
{
ofstream ofile(ofn, ios::out);
for (int t = 0; t < outputBuffer.Size(); t++) {
auto res = outputBuffer[t];
for (int i = 0; i < res->res.Size(); i++) {
/* ids below 4 are reserved for special symbols (e.g., PAD/SOS/EOS/UNK),
   so stop at the first special token */
if (res->res[i] < 4)
break;
ofile << tgtVocab.id2word[res->res[i]] << " ";
}
ofile << "\n";
}
ofile.close();
}
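/* Usage sketch (illustrative; file paths and `devID` are assumptions):

   DataSet testSet;
   testSet.Init("test.txt", "vocab.src", "vocab.tgt");
   while (!testSet.IsEmpty()) {
       XTensor batchEnc, paddingEnc;
       UInt64List ids = testSet.LoadBatch(&batchEnc, &paddingEnc, 32, 0, devID);
       // ... translate the batch and append results to outputBuffer ...
   }
   testSet.SortOutput();
   testSet.DumpRes("output.txt");
*/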
/* de-constructor */
DataSet::~DataSet()
{
/* release the file */
delete fp;
/* release the input buffer */
for (int i = 0; i < inputBuffer.Size(); i++)
delete inputBuffer[i];
/* release the output buffer */
for (int i = 0; i < outputBuffer.Size(); i++)
delete outputBuffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "Vocab.h"
#include "../../niutensor/tensor/XList.h"
#include "../../niutensor/tensor/XTensor.h"
#include "../../niutensor/tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* the struct of tokenized input */
struct Example {
int id;
IntList values;
};
/* the struct of tokenized output */
struct Result {
int id;
IntList res;
};
/* A `DataSet` is associated with a file which contains variable length data.*/
struct DataSet {
public:
/* the data buffer */
InputBufferType inputBuffer;
/* a list of empty line number */
IntList emptyLines;
/* the result buffer */
OutputBufferType outputBuffer;
/* the pointer to file stream */
ifstream* fp;
/* size of used data in buffer */
size_t bufferUsed;
/* the source vocabulary */
Vocab srcVocab;
/* the target vocabulary */
Vocab tgtVocab;
public:
/* sort the input by length */
void SortInput();
/* reorder the output by ids */
void SortOutput();
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t sBatch, size_t wBatch, int devID);
/* initialization function */
void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);
/* check if the buffer is empty */
bool IsEmpty();
/* dump the translations to a file */
void DumpRes(const char* ofn);
/* de-constructor */
~DataSet();
};
}
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "LengthPenalty.h"
using namespace nts;
namespace nmt
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence
*/
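/* a worked example (illustrative numbers, not from the code): with alpha = 0.6
and a sequence of length n = 10, lp = ((5 + 10) / 6)^0.6 = 2.5^0.6 ≈ 1.73,
so the log-probability of the path is divided by about 1.73 before ranking */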
float LengthPenalizer::GNMT(float length, float alpha)
{
float base;
float lp;
base = (length + 5.0F) / (1.0F + 5.0F);
lp = pow(base, alpha);
return lp;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __LENGTHPENALTY_H__
#define __LENGTHPENALTY_H__
#include "../Utility.h"
#include "../../niutensor/tensor/XTensor.h"
using namespace nts;
namespace nmt
{
/* We penalize short sequences: a hypothesis score is a product of
probability-like terms, so shorter sequences tend to receive higher scores
and thus have more chances to beat others in search. */
class LengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static float GNMT(float length, float alpha);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <iostream>
#include "Predictor.h"
#include "../layer/NNUtil.h"
using namespace nts;
namespace nmt
{
/* constructor */
StateBundle::StateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
StateBundle::~StateBundle()
{
if (states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void StateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new State[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = _PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
Predictor::Predictor()
{
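/* 2 is the default start symbol, matching SOS in the vocabulary */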
startSymbol = 2;
}
/* de-constructor */
Predictor::~Predictor()
{
}
/*
create an initial state
>> model - the model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
int beamSize, StateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->dimSize[i];
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
state->probPath.SetZeroAll();
state->nstep = 0.0F;
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void Predictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void Predictor::Read(Model* model, StateBundle* state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states
>> aliveState - indices of alive states, (B)
>> encoding - encoder output, (B, L, E)
>> inputEnc - input of the encoder, (B, L)
>> paddingEnc - padding of the encoder, (B, L)
>> batchSize - the batch size of the decoder input (batch * beam; some states may have been pruned)
>> isStart - whether it is the start state or not
>> reorderState - the new order of states
>> needReorder - whether we need to reorder the states
>> nstep - current time step of the target sequence
*/
void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
{
int dims[MAX_TENSOR_DIM_NUM];
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
InitTensor2D(&first, batchSize, 1, X_INT, inputEnc.devID);
first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s, inputEnc.devID);
}
/* keep alive states for the decoder */
if (aliveState.dimSize[0] < batchSize) {
/* alive inputs */
inputDec = AutoGather(inputDec, aliveState);
/* alive cache */
for (int i = 0; i < m->decoder->nlayer; i++) {
m->decoder->selfAttCache[i].KeepAlive(aliveState);
m->decoder->enDeAttCache[i].KeepAlive(aliveState);
}
}
if (needReorder) {
for (int i = 0; i < m->decoder->nlayer; i++) {
m->decoder->selfAttCache[i].Reorder(reorderState);
m->decoder->enDeAttCache[i].Reorder(reorderState);
}
}
/* prediction probabilities */
XTensor& output = next->prob;
XTensor decoding;
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.dimSize[i];
dims[inputDec.order - 1] = inputDec.dimSize[inputDec.order - 1];
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc.devID);
paddingDec.SetDataFixed(1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, encoding, NULL, &maskEncDec, nstep, false);
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
/* generate the output probabilities */
m->outputLayer->Make(decoding, output, false, true);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor Predictor::GeneratePaths(StateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
/*
get the predictions of the previous step
>> state - state bundle of the current step
>> devID - the device id for the predictions
*/
XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
IntList last;
for (int i = 0; i < state->stateNum; i++) {
State* cur = state->states + i;
last.Add(cur->prediction);
}
XTensor lastPred;
InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
lastPred.SetData(last.items, int(last.Size()));
return lastPred;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __PREDICTOR_H__
#define __PREDICTOR_H__
#include "../Model.h"
#include "LengthPenalty.h"
using namespace std;
namespace nmt
{
#define _PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class State
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we go over so far */
int nstep;
/* pointer to the previous state */
State* last;
};
/* a bundle of states */
class StateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
float nstep;
/* list of states */
State* states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
StateBundle();
/* de-constructor */
~StateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class Predictor
{
private:
/* pointer to the transformer model */
Model* m;
/* current state */
StateBundle* s;
/* start symbol */
int startSymbol;
/* end symbol */
int endSymbol;
public:
/* constructor */
Predictor();
/* de-constructor */
~Predictor();
/* create an initial state */
void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(Model* model, StateBundle* state);
/* predict the next state */
void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(StateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(StateBundle* state, int devID);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "Search.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* constructor */
BeamSearch::BeamSearch()
{
alpha = 0;
maxLength = 0;
beamSize = 0;
batchSize = 0;
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
BeamSearch::~BeamSearch()
{
if (fullHypos != NULL)
delete[] fullHypos;
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the search
>> config - the configuration
*/
void BeamSearch::Init(Config& config)
{
beamSize = config.beamSize;
batchSize = config.sBatchSize;
alpha = config.lenAlpha;
endSymbols[0] = config.endID;
startSymbol = config.startID;
scalarMaxLength = config.maxLenAlpha;
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
*/
void BeamSearch::Prepare(int myBatchSize, int myBeamSize)
{
batchSize = myBatchSize;
beamSize = myBeamSize;
needReorder = false;
/* prepare for the heap of hypotheses */
if (fullHypos != NULL)
delete[] fullHypos;
fullHypos = new XHeap<MIN_HEAP, float>[batchSize];
for (int i = 0; i < batchSize; i++)
fullHypos[i].Init(beamSize);
/* prepare for the indices of alive states */
aliveStatePids.Clear();
aliveSentList.Clear();
for (int i = 0; i < batchSize; i++) {
aliveStatePids.Add(i);
aliveSentList.Add(i);
}
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void BeamSearch::Search(Model* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score)
{
Predictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
XTensor inputBeam;
XTensor paddingBeam;
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input.unitNum / input.dimSize[input.order - 1], beamSize);
/* encoder mask */
model->MakeMTMaskEnc(padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(input, &maskEnc, false);
encodingBeam = Unsqueeze(encoding, encoding.order - 2, beamSize);
inputBeam = Unsqueeze(input, input.order - 1, beamSize);
paddingBeam = Unsqueeze(padding, padding.order - 1, beamSize);
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
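/* e.g., an encoder output of shape (B, L, E) is tiled to (B, beam, L, E)
and then merged into (B * beam, L, E), so that each hypothesis in the
beam works on its own copy of the encoding */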
/* max output-length = scalar * source-length */
int lengthLimit = (int)(input.dimSize[input.order - 1] * scalarMaxLength);
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
StateBundle* states = new StateBundle[lengthLimit + 1];
StateBundle* first = states;
StateBundle* cur = NULL;
StateBundle* next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first);
predictor.SetStartSymbol(startSymbol);
first->isStart = true;
XTensor aliveState;
InitTensor1D(&aliveState, batchSize * beamSize, X_INT, input.devID);
SetAscendingOrder(aliveState, 0);
XTensor reorderState;
InitTensor1D(&reorderState, batchSize * beamSize, X_INT, input.devID);
SetAscendingOrder(reorderState, 0);
/* generate the sequence from left to right */
for (int l = 0; l < lengthLimit; l++) {
if (beamSize > 1) {
inputBeam = AutoGather(inputBeam, reorderState);
paddingBeam = AutoGather(paddingBeam, reorderState);
encodingBeam = AutoGather(encodingBeam, reorderState);
}
cur = states + l;
next = states + l + 1;
/* read the current state */
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, aliveState, encodingBeam, inputBeam,
paddingBeam, batchSize * beamSize, l == 0, reorderState, needReorder, l);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(cur, next);
/* expand the search graph */
Expand(cur, next, reorderState);
/* push complete hypotheses into the heap */
Collect(next);
/* stop searching when all hypotheses are completed */
if (IsAllCompleted(next)) {
maxLength = l + 1;
break;
}
/* remove finished sentences */
//RemoveFinishedStates(next, encodingBeam, inputBeam, paddingBeam, aliveState);
}
/* fill the heap with incomplete hypotheses if necessary */
FillHeap(next);
Dump(output, &score);
delete[] states;
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Score(StateBundle* prev, StateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor mask;
int order = prob.order;
int outputSize = prob.dimSize[prob.order - 1];
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++)
dims[i] = prob.dimSize[i];
if (prob.dataType == X_FLOAT16)
prob = ConvertDataType(prob, X_FLOAT);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
SumDim(prob, probPathPrev, probPath, 0);
beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */
score = probPath / lp;
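/* with alpha > 0, lp grows with the hypothesis length, so dividing by it
keeps long hypotheses competitive against short ones */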
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
SumDim(score, firstMask, score, 0);
}
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
SumDim(score, mask, score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
}
/*
generate tokens for the next state via beam pruning
>> prev - the last beam
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor indexCPU;
XTensor& score = beam->modelScore;
XTensor& index = beam->prediction;
XTensor& preID = beam->preID;
XTensor& probPath = beam->probPath;
XTensor& prob = beam->prob;
int order = score.order;
for (int i = 0; i < order; i++) {
dims[i] = score.dimSize[i];
dimsBeam[i] = score.dimSize[i];
dimsTopK[i] = score.dimSize[i];
}
CheckNTErrors(order >= 3, "The tensor must be of order 3 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.dimSize[score.order - 1];
int stride = score.dimSize[score.order - 1];
dimsBeam[order - 3] /= beamSize;
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType, score.devID);
InitTensor(&index, order, dimsTopK, X_INT, score.devID);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
InitTensor(&indexCPU, order, dimsTopK, X_INT, -1);
score.Reshape(order, dimsBeam);
prob.Reshape(order, dimsBeam);
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypotheses. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypotheses in the top-k list. */
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it with vocab-size and computing the remainder. */
ModMe(index, sizeVocab);
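/* a small worked example (illustrative numbers): with beamSize = 2 and a
vocabulary of 8 words, a flat top-k index of 13 decodes to previous state
13 / 8 = 1 and word id 13 % 8 = 5 */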
/* we keep the top-k scores */
score = CopyValues(scoreTopK);
for (int i = 0; i < indexCPU.unitNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
indexCPU.SetInt(i * stride + indexCPU.GetInt(i + j), i + j);
}
}
/* sequence probability of top-k candidates */
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.dimSize[i];
dimsTopK[i] = scoreTopK.dimSize[i];
}
order = probPath.order;
prob.Reshape(prob.unitNum, 1);
probPath.Reshape(probPath.unitNum, 1);
indexCPU.Reshape(indexCPU.dimSize[0], indexCPU.dimSize[indexCPU.order - 1]);
indexCPU.SetDevice(prob.devID);
prob = Gather(prob, indexCPU);
probPath = Gather(probPath, indexCPU);
prob.Reshape(order, dimsTopK);
probPath.Reshape(order, dimsTopK);
}
/*
expand the search graph
>> prev - the last beam
>> beam - the beam that keeps a number of states
>> reorderState - the new order of states
*/
void BeamSearch::Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
State* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor reorderStateCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
InitTensorOnCPU(&reorderStateCPU, &reorderState);
/* we copy the data to the CPU because frequent access to the GPU is slow
and we can speed up the process by doing the job on the CPU. */
CopyValues(idRef, id);
CopyValues(modelScoreRef, modelScore);
CopyValues(probRef, prob);
CopyValues(probPathRef, probPath);
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All these are
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
State& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
reorderStateCPU.SetInt(i + offset, i + j);
if (offset != j)
needReorder = true;
State* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
/* pointer to the previous state */
if (prev->isStart) {
state.last = NULL;
state.pid = pid;
state.nstep = 0;
state.isCompleted = false;
}
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
state.isCompleted = last->isCompleted;
CheckNTErrors(offset < prev->stateNum, "Wrong state index!");
}
/*if(aliveStatePids.size() < batchSize)
state.pid = aliveStatePids[i/beamSize];*/
/* scores */
state.modelScore = modelScore.Get(k);
state.prob = prob.Get(k);
state.probPath = probPath.Get(k);
/* prediction */
state.prediction = prediction.GetInt(k);
CheckNTErrors(state.prediction >= 0, "Illegal prediction!");
/* check if it is the end of the sequence */
state.isEnd = IsEnd(state.prediction);
state.isCompleted = (state.isCompleted || state.isEnd);
/* set the ending mark */
endMarkCPU.SetInt(state.isEnd, k);
}
}
/* copy the ending mark from CPU to the target device */
CopyValues(endMarkCPU, endMark);
CopyValues(reorderStateCPU, reorderState);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Collect(StateBundle* beam)
{
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
State& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
/*
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void BeamSearch::FillHeap(StateBundle* beam)
{
State* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) {
State& state = states[i * beamSize + j];
/* if the heap is empty, push the incomplete hypothesis directly */
if (fullHypos[state.pid].Count() == 0) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
else {
/* otherwise only keep it if it beats the current best;
calling Top() is safe here since the heap is non-empty */
auto node = fullHypos[state.pid].Top();
float score = node.value;
if (score < state.modelScore)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
}
/*
save the output sequences in a tensor
>> output - output sequences (for return)
>> score - score of the sequences
*/
void BeamSearch::Dump(IntList* output, XTensor* score)
{
int dims[3] = { batchSize, 1, maxLength };
InitTensor(score, 2, dims, X_FLOAT);
score->SetZeroAll();
/* heap for an input sentence in the batch */
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float>& heap = fullHypos[h];
int c = heap.Count();
float bestScore = -1e9F;
State* state = NULL;
for (int i = 0; i < c; i++) {
auto node = heap.Pop();
State* s = (State*)node.index;
if (i == 0 || bestScore < node.value) {
state = s;
bestScore = node.value;
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
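/* positions after the end of the sequence are filled with
the end symbol (id 2, i.e., EOS) */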
if (isCompleted) {
output[h].Add(2);
}
else {
output[h].Add(state->prediction);
}
state = state->last;
}
output[h].Reverse();
score->Set2D(bestScore, h, 0);
}
}
/*
check if the token is an end symbol
>> token - token to be checked
*/
bool BeamSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
endSymbols = NULL;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols, so allocate a fresh array and copy them */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool BeamSearch::IsAllCompleted(StateBundle* beam)
{
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
State& state = states[i];
if (!state.isCompleted)
return false;
}
return true;
}
/*
update the beam by removing finished hypotheses
>> beam - the beam that keeps the searching states
>> aliveEncoding - new input embeddings for the encoder, (B, L, E)
>> aliveInput - new input tokens of the encoder, (B, L)
>> alivePadding - new paddings for the inputs, (B, L)
<< aliveState - the indices of alive states
*/
void BeamSearch::RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState)
{
State* states = beam->states;
/* get the indices of uncompleted sentences and states */
aliveSentList.Clear();
IntList aliveStateList;
int count = 0;
/* the number of completed sentences */
for (int i = 0; i < beam->stateNum; i += beamSize) {
int endState = 0;
for (int j = 0; j < beamSize; j++) {
if (states[i + j].isEnd) {
endState++;
}
}
bool isSentCompleted = (endState == beamSize);
int sent = i / beamSize;
if (!isSentCompleted) {
aliveSentList.Add(sent);
for (int j = 0; j < beamSize; j++) {
aliveStateList.Add(i + j);
}
}
else {
aliveStatePids.Remove(sent - count);
count++;
}
}
InitTensor1D(&aliveState, int(aliveStateList.Size()), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, int(aliveStateList.Size()));
XTensor aliveSent;
InitTensor1D(&aliveSent, int(aliveSentList.Size()), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, int(aliveSentList.Size()));
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState);
alivePadding = AutoGather(alivePadding, aliveState);
aliveEncoding = AutoGather(aliveEncoding, aliveState);
beam->prob = AutoGather(beam->prob, aliveSent);
beam->endMark = AutoGather(beam->endMark, aliveSent);
beam->probPath = AutoGather(beam->probPath, aliveSent);
beam->modelScore = AutoGather(beam->modelScore, aliveSent);
beam->prediction = AutoGather(beam->prediction, aliveSent);
}
}
/*
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
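/* e.g., for beamSize = 3 the mask over a flattened beam looks like
[0, -1e9, -1e9, 0, -1e9, -1e9, ...]: at the first step all hypotheses in a
beam are identical, so only the first one is allowed to survive pruning */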
XTensor BeamSearch::MakeFirstMask(StateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order - 1; i++)
dims[i] = prob.dimSize[i];
InitTensor(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
if (i % beamSize != 0)
mask.Set(-1e9, i);
}
mask.SetDevice(prob.devID);
return mask;
}
/* constructor */
GreedySearch::GreedySearch()
{
maxLength = 0;
batchSize = 0;
endSymbolNum = 0;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
GreedySearch::~GreedySearch()
{
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the search
>> config - the configuration
*/
void GreedySearch::Init(Config& config)
{
batchSize = config.wBatchSize;
endSymbols[0] = config.endID;
startSymbol = config.startID;
scalarMaxLength = config.maxLenAlpha;
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
prepare for search
>> batchSize - size of the batch
*/
void GreedySearch::Prepare(int myBatchSize)
{
batchSize = myBatchSize;
}
/* check if the token is an end symbol */
bool GreedySearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/* set end symbols for search */
void GreedySearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
endSymbols = NULL;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols, so allocate a fresh array and copy them */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void GreedySearch::Search(Model* model, XTensor& input,
XTensor& padding, IntList* output)
{
XTensor maskEnc;
XTensor encoding;
/* dynamic batch size */
Prepare(input.unitNum / input.dimSize[input.order - 1]);
/* encoder mask */
model->MakeMTMaskEnc(padding, maskEnc);
/* make the encoding network */
encoding = model->encoder->Make(input, &maskEnc, false);
/* max output-length = scalar * source-length */
maxLength = (int)(input.dimSize[input.order - 1] * scalarMaxLength);
/* the first token */
XTensor inputDec;
InitTensor2D(&inputDec, batchSize, 1, X_INT, input.devID);
inputDec.SetDataFixed(startSymbol);
/* initialize the finished flags */
int* finishedFlags = new int[batchSize];
for (int i = 0; i < batchSize; i++)
finishedFlags[i] = 0;
/* generate the sequence from left to right */
int l = 0;
for (; l < maxLength; l++) {
XTensor prob;
XTensor maskDec;
XTensor maskEncDec;
XTensor paddingDec;
XTensor decoding;
XTensor indexCPU;
XTensor bestScore;
InitTensor(&paddingDec, inputDec.order, inputDec.dimSize, X_INT, padding.devID);
paddingDec.SetDataFixed(1);
/* decoder mask */
model->MakeMTMaskDec(padding, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = model->decoder->Make(inputDec, encoding, NULL, &maskEncDec, l, false);
/* generate the output probabilities */
model->outputLayer->Make(decoding, prob, false, false);
/* get the most promising prediction */
prob.Reshape(prob.dimSize[0], prob.dimSize[prob.order - 1]);
InitTensor2D(&bestScore, prob.dimSize[0], 1, prob.dataType, prob.devID);
TopK(prob, bestScore, inputDec, -1, 1);
/* save the prediction */
InitTensorOnCPU(&indexCPU, &inputDec);
CopyValues(inputDec, indexCPU);
for (int i = 0; i < batchSize; i++) {
output[i].Add(indexCPU.GetInt(i));
if (IsEnd(indexCPU.GetInt(i)))
finishedFlags[i] = 1;
}
int finished = 0;
for (int i = 0; i < batchSize; i++)
finished += finishedFlags[i];
if (finished == batchSize)
break;
}
delete[] finishedFlags;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __SEARCH_H__
#define __SEARCH_H__
#include "../Model.h"
#include "Predictor.h"
using namespace std;
namespace nmt
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class BeamSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scalar of the input sequence (for max number of search steps) */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
/* pids for alive states */
IntList aliveStatePids;
/* alive sentences */
IntList aliveSentList;
/* whether we need to reorder the states */
bool needReorder;
public:
/* constructor */
BeamSearch();
/* de-constructor */
~BeamSearch();
/* initialize the search */
void Init(Config& config);
/* search for the most promising states */
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(StateBundle* prev, StateBundle* beam);
/* generate token indices via beam pruning */
void Generate(StateBundle* prev, StateBundle* beam);
/* expand the search graph */
void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */
void Collect(StateBundle* beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(StateBundle* beam);
/* save the output sequences and score */
void Dump(IntList* output, XTensor* score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(StateBundle* beam);
/* update the beam by pruning finished states */
void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(StateBundle* beam);
};
class GreedySearch
{
private:
/* predictor */
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
/* batch size */
int batchSize;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scalar of the input sequence (for max number of search steps) */
float scalarMaxLength;
public:
/* constructor */
GreedySearch();
/* de-constructor */
~GreedySearch();
/* initialize the search */
void Init(Config& config);
/* search for the most promising states */
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */
void Prepare(int myBatchSize);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "Search.h"
#include "Translator.h"
#include "../Utility.h"
#include "../../niutensor/tensor/XTensor.h"
#include "../../niutensor/tensor/XUtility.h"
#include "../../niutensor/tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* constructor */
Translator::Translator()
{
}
/* de-constructor */
Translator::~Translator()
{
if (beamSize > 1)
delete (BeamSearch*)seacher;
else
delete (GreedySearch*)seacher;
}
/* initialize the translator */
void Translator::Init(Config& config)
{
beamSize = config.beamSize;
vSize = config.srcVocabSize;
vSizeTgt = config.tgtVocabSize;
sentBatch = config.sBatchSize;
wordBatch = config.wBatchSize;
if (beamSize > 1) {
LOG("translating with beam search (%d)", beamSize);
seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config);
}
else if (beamSize == 1) {
LOG("translating with greedy search");
seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config);
}
else {
CheckNTErrors(false, "Invalid beam size\n");
}
}
/*
translate an input file
>> ifn - input data file
>> sfn - source vocab file
>> tfn - target vocab file
>> ofn - output data file
>> model - pretrained model
*/
void Translator::Translate(const char* ifn, const char* sfn,
const char* tfn, const char* ofn, Model* model)
{
int wc = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
int devID = model->devID;
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
/* padding */
XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn);
LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);
int count = 0;
double batchStart = GetClockSec();
while (!batchLoader.IsEmpty())
{
count++;
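/* reset the decoder's self-attention and encoder-decoder
attention caches before translating a new batch */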
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
auto indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc,
sentBatch, wordBatch, devID);
IntList* output = new IntList[indices.Size() - 1];
/* greedy search */
if (beamSize == 1) {
((GreedySearch*)seacher)->Search(model, batchEnc, paddingEnc, output);
}
/* beam search */
else {
XTensor score;
((BeamSearch*)seacher)->Search(model, batchEnc, paddingEnc, output, score);
}
for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result;
res->id = int(indices[i]);
res->res = output[i];
batchLoader.outputBuffer.Add(res);
}
delete[] output;
wc += int(indices[-1]);
wordCountTotal += int(indices[-1]);
sentCount += int(indices.Size() - 1);
batchCount += 1;
if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec();
LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
wc = 0;
}
}
/* append empty lines to the result */
for (int i = 0; i < batchLoader.emptyLines.Size(); i++) {
Result* emptyRes = new Result;
emptyRes->id = batchLoader.emptyLines[i];
batchLoader.outputBuffer.Add(emptyRes);
}
double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void Translator::Dump(FILE* file, XTensor* output)
{
if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1];
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
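/* stop at an invalid id, PAD (1), or SOS/EOS (2) */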
if (w < 0 || w == 1 || w == 2)
break;
fprintf(file, "%d ", w);
}
fprintf(file, "\n");
}
}
else
{
fprintf(file, "\n");
}
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TESTER_H__
#define __TESTER_H__
#include "Search.h"
#include "DataSet.h"
namespace nmt
{
/* This class translates test sentences with a trained model. */
class Translator
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* batch size for sentences */
int sentBatch;
/* batch size for words */
int wordBatch;
/* beam size */
int beamSize;
/* for batching */
DataSet batchLoader;
/* decoder for inference */
void* seacher;
public:
/* constructor */
Translator();
/* de-constructor */
~Translator();
/* initialize the translator */
void Init(Config& config);
/* translate an input file */
void Translate(const char* ifn, const char* sfn, const char* tfn,
const char* ofn, Model* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#include <fstream>
#include "Vocab.h"
#include "../Utility.h"
namespace nts {
/* load a vocabulary from a file */
void Vocab::Load(const string& src)
{
string vsz, sid;
ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "unable to open the vocabulary file");
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
}
f.close();
}
/* save a vocabulary to a file */
void Vocab::Save(const string& src)
{
ofstream f(src, ios::out);
/* the first line: size of the vocab and the start id */
f << vocabSize << "\t" << startID << "\n";
/* other lines: words and indices */
for (const auto& p : word2id)
f << p.first << "\t" << p.second << "\n";
f.close();
}
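/* an example of the resulting file (illustrative entries, assuming startID = 4):
8	4
hello	4
world	5
the first line holds the vocab size and the start id; each following
line holds a word and its id */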
/*
copy data from another vocabulary
>> v - the target vocabulary
*/
void Vocab::CopyFrom(const Vocab& v)
{
for (const auto& w2i : v.word2id)
word2id.insert(w2i);
for (const auto& i2w : v.id2word)
id2word.insert(i2w);
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#ifndef __VOCAB_H__
#define __VOCAB_H__
#include <cstdio>
#include <unordered_map>
using namespace std;
namespace nts {
/* user-defined symbols */
#define PAD 1
#define SOS 2
#define EOS 2
#define UNK 3
/* the vocabulary class */
struct Vocab
{
/* the start id for words */
int startID;
/* size of the vocabulary */
int vocabSize;
/* a dict that maps words to ids */
unordered_map<string, int> word2id;
/* a dict that maps ids to words */
unordered_map<int, string> id2word;
/* load a vocabulary from a file */
void Load(const string& src);
/* save a vocabulary to a file */
void Save(const string& src);
/* copy data from another vocab */
void CopyFrom(const Vocab& v);
};
}
#endif
\ No newline at end of file
'''
Ensemble multiple models by checkpoint averaging.
Usage: python3 Ensemble.py -input <model_files> -output <ensembled_model>
Help: python3 Ensemble.py -h
'''
import argparse
import numpy as np
from glob import glob
from struct import pack
from struct import unpack
parser = argparse.ArgumentParser(
description='A model ensemble tool for NiuTrans.NMT')
parser.add_argument('-input', help='Model file pattern, e.g., \'model.bin.*\'',
type=str, default='model.bin.*')
parser.add_argument('-output', help='The ensembled model',
type=str, default='model.ensemble')
args = parser.parse_args()
model_files = glob(args.input)
meta_infos = None
parameters = []
for file in model_files:
with open(file, "rb") as f:
meta_infos = f.read(12 * 4)
data = f.read()
values = unpack('f' * (len(data) // 4), data)
print("Loaded {} parameters from: {}".format(len(values), file))
parameters.append(np.array(values))
parameters = np.mean(np.array(parameters), axis=0)
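# e.g., averaging two checkpoints gives the elementwise mean (p1 + p2) / 2 for
# every parameter; the 12-int meta header is taken from the last file read above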
with open(args.output, "wb") as f:
f.write(meta_infos)
values = pack("f" * len(parameters), *parameters)
f.write(values)
print("Model ensemble finished")
'''
Convert the format of a model.
Usage: python3 FormatConverter.py -input <raw_model> -output <new_model> -format <fp16|fp32>
Help: python3 FormatConverter.py -h
'''
import argparse
import numpy as np
from glob import glob
from struct import pack
from struct import unpack
parser = argparse.ArgumentParser(
description='The format converter for NiuTrans.NMT')
parser.add_argument('-input', help='Path of the raw model file',
type=str, default='')
parser.add_argument('-output', help='Path of the new model file',
type=str, default='')
parser.add_argument('-format', help='Target storage format, FP16 (Default) or FP32', type=str, default='fp16')
args = parser.parse_args()
args.format = args.format.lower()
META_INFO_NUM = 12
meta_infos = None
parameters = None
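# PARAM_LEN is the byte width of each value in the *input* file:
# converting to fp32 implies an fp16 input (2 bytes per value), and vice versa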
if args.format == 'fp32':
PARAM_LEN = 2
elif args.format == 'fp16':
PARAM_LEN = 4
else:
raise NotImplementedError("Unsupported data type")
with open(args.input, "rb") as f:
meta_infos = f.read(META_INFO_NUM * 4)
data = f.read()
if args.format == 'fp32':
values = unpack('e' * (len(data) // PARAM_LEN), data)
elif args.format == 'fp16':
values = unpack('f' * (len(data) // PARAM_LEN), data)
print("Loaded {} parameters from: {}".format(len(values), args.input))
parameters = np.array(values)
with open(args.output, "wb") as f:
f.write(meta_infos)
if args.format == 'fp32':
values = pack("f" * len(parameters), *(parameters.astype(np.float32)))
elif args.format == 'fp16':
values = pack("e" * len(parameters), *(parameters.astype(np.float16)))
f.write(values)
\ No newline at end of file
'''
Convert a bpe vocabulary to a NiuTrans.NMT vocab
Usage: python3 GetVocab.py -raw <bpe_vocab> -new <niutrans_nmt_vocab>
'''
import sys
import argparse
parser = argparse.ArgumentParser(description='Convert a BPE vocabulary to a NiuTrans.NMT vocabulary')
parser.add_argument('-raw', help='Path of the BPE vocabulary', type=str, default='')
parser.add_argument('-new', help='Path of the NiuTrans.NMT vocabulary to be saved', type=str, default='')
args = parser.parse_args()
# User defined words
PAD=1
SOS=2
EOS=2
UNK=3
with open(args.raw, "r", encoding="utf8") as fi:
with open(args.new, "w", encoding="utf8") as fo:
all_lines = fi.readlines()
vocab_size = len(all_lines) + UNK + 1
# make sure the vocabulary size is divisible by 8
vocab_size += (8 - vocab_size % 8)
start_id = UNK + 1
# first line: vocab size, start id
fo.write("{} {}\n".format(vocab_size, start_id))
# other lines: word, id
for l in all_lines:
fo.write("{} {}\n".format(l.split()[0], start_id))
start_id += 1
\ No newline at end of file
'''
Convert a fairseq checkpoint to a NiuTrans.NMT model.
Usage: python3 ModelConverter.py -src <fairseq_models> -tgt <niutrans_nmt_model>
Help: python3 ModelConverter.py -h
Requirements: fairseq >= 0.6.2
'''
import torch
import argparse
import numpy as np
from glob import glob
from struct import pack
parser = argparse.ArgumentParser(
description='The model converter for NiuTrans.NMT')
parser.add_argument('-src', help='The pattern used to find fairseq checkpoints, e.g., \'checkpoint*\'',
type=str, default='checkpoint')
parser.add_argument('-tgt', help='The file name prefix for Niutrans.NMT models',
type=str, default='model')
parser.add_argument('-mode', help='Storage mode, FP32 (Default) or FP16', type=str, default='fp32')
args = parser.parse_args()
args.mode = args.mode.lower()
def get_model_parameters(m):
'''
get flattened transformer model parameters
'''
p = []
encoder_emb = None
decoder_emb = None
decoder_output_w = None
for k in m['model']:
if 'encoder.embed_tokens.weight' in k:
encoder_emb = m['model'][k]
elif 'decoder.embed_tokens.weight' in k:
decoder_emb = m['model'][k]
elif 'decoder.embed_out' in k:
decoder_output_w = m['model'][k]
elif m['model'][k].numel() != 1:
# ignore fairseq version descriptions
if 'weight' in k:
# weights for qkv
if 'in_proj' in k:
# split qkv weights to slices
dim = m['model'][k].shape[0] // 3
p.append((m['model'][k][:dim, :]).t())
p.append((m['model'][k][dim:dim*2, :]).t())
p.append((m['model'][k][dim*2:, :]).t())
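# e.g., a fused in_proj weight of shape (3 * 512, 512) yields three
# (512, 512) slices for q, k and v, each transposed before saving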
else:
if 'norm' in k:
p.append(m['model'][k])
else:
# transpose all other weights for matrix multiplication
p.append(m['model'][k].t())
else:
# bias
p.append(m['model'][k])
# encoder embedding weight
p.append(encoder_emb)
# decoder embedding weight
if decoder_emb is not None:
p.append(decoder_emb)
else:
print('Sharing all embeddings')
# decoder output weight
if decoder_output_w is not None:
p.append(decoder_output_w)
else:
print('Sharing decoder input output embeddings')
return p
with torch.no_grad():
    model_files = glob(args.src)
    for index, model_file in enumerate(model_files):
        print('-' * 120)
        print("source model: '{}' ({}/{})".format(model_file, index + 1, len(model_files)))
        print("target model: '{}'".format(args.tgt + "." + str(index)))
        model = torch.load(model_file, map_location='cpu')
        meta_info = {
            'src_vocab_size': 0,
            'tgt_vocab_size': 0,
            'encoder_layer': model['args'].encoder_layers,
            'decoder_layer': model['args'].decoder_layers,
            'ffn_hidden_size': model['args'].encoder_ffn_embed_dim,
            'hidden_size': model['args'].decoder_input_dim,
            'emb_size': model['args'].encoder_embed_dim,
            'head_num': model['args'].encoder_attention_heads,
            'max_relative_length': model['args'].max_relative_length,
            'share_all_embeddings': model['args'].share_all_embeddings,
            'share_decoder_input_output_embed': model['args'].share_decoder_input_output_embed,
            'max_source_positions': model['args'].max_source_positions,
        }
        params = get_model_parameters(model)
        print('total params: ', len(params))
        print('total params size: ', sum([p.numel() for p in params]))
        model = model['model']
        with open(args.tgt + "." + str(index) + ".name.txt", "w") as name_list:
            for p in model:
                name_list.write("{}\t{}\n".format(p, model[p].shape))
                if 'embed_tokens' in p:
                    if 'encoder' in p:
                        meta_info['src_vocab_size'] = model[p].shape[0]
                    else:
                        meta_info['tgt_vocab_size'] = model[p].shape[0]
        meta_info_list = [
            meta_info['encoder_layer'],
            meta_info['decoder_layer'],
            meta_info['ffn_hidden_size'],
            meta_info['hidden_size'],
            meta_info['emb_size'],
            meta_info['src_vocab_size'],
            meta_info['tgt_vocab_size'],
            meta_info['head_num'],
            meta_info['max_relative_length'],
            meta_info['share_all_embeddings'],
            meta_info['share_decoder_input_output_embed'],
            meta_info['max_source_positions'],
        ]
        print(meta_info)
        meta_info_list = [int(p) for p in meta_info_list]
        meta_info = pack("i" * len(meta_info_list), *meta_info_list)
        with open(args.tgt + "." + str(index), 'wb') as tgt:
            # part 1: meta info
            tgt.write(meta_info)
            # part 2: values of parameters (in FP32 or FP16)
            for p in params:
                if args.mode == 'fp32':
                    values = pack("f" * p.numel(),
                                  *(p.contiguous().view(-1).cpu().numpy()))
                    tgt.write(values)
                elif args.mode == 'fp16':
                    # struct's 'e' (half-precision) format requires Python >= 3.6
                    values = pack("e" * p.numel(),
                                  *(p.contiguous().view(-1).cpu().numpy().astype(np.float16)))
                    tgt.write(values)
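# --- Hedged sketch (not part of the original script): read back the
# 12-integer meta header of a converted model to sanity-check the conversion.
# The field order mirrors meta_info_list above; the path 'model.0' is an
# assumed example output name.
from struct import unpack

def read_meta(path='model.0'):
    fields = ['encoder_layer', 'decoder_layer', 'ffn_hidden_size',
              'hidden_size', 'emb_size', 'src_vocab_size', 'tgt_vocab_size',
              'head_num', 'max_relative_length', 'share_all_embeddings',
              'share_decoder_input_output_embed', 'max_source_positions']
    with open(path, 'rb') as f:
        values = unpack('i' * len(fields), f.read(4 * len(fields)))
    return dict(zip(fields, values))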
'''
Prepare parallel training data for NiuTrans.NMT.
Help: python3 PrepareParallelData.py -h
Training data format (binary):
first 8 bytes: source and target vocabulary sizes (4 bytes each)
next 8 bytes: number of sentence pairs
subsequent segments:
    source sentence length (4 bytes)
    target sentence length (4 bytes)
    source tokens (4 bytes per token)
    target tokens (4 bytes per token)
'''
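# Layout example (added for illustration): for a corpus of one pair with
# source ids [5, 9] and target ids [7, 4], the file contains
#   2 ints   (source and target vocabulary sizes)
#   1 uint64 (pair count = 1)
#   2 ints   (lengths: 3 and 3, counting the appended EOS / prepended SOS)
#   3 ints   (source ids: 5, 9, EOS)
#   3 ints   (target ids: SOS, 7, 4)
# All values use struct's native byte order (little-endian on x86).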
from struct import pack
import argparse
# User-defined special token ids (SOS and EOS share id 2)
PAD = 1
SOS = 2
EOS = 2
UNK = 3
# The maximum length for a sentence
MAX_SENT_LEN = 120
parser = argparse.ArgumentParser(
    description='Prepare parallel data for NMT training')
parser.add_argument('-src', help='Source language file', type=str, default='')
parser.add_argument('-tgt', help='Target language file', type=str, default='')
parser.add_argument(
    '-src_vocab', help='Source language vocab file', type=str, default='')
parser.add_argument(
    '-tgt_vocab', help='Target language vocab file', type=str, default='')
parser.add_argument('-output', help='Training file', type=str, default='')
args = parser.parse_args()
src_vocab = dict()
tgt_vocab = dict()
cut_num = 0
def load_vocab(vocab, file):
    with open(file, 'r', encoding='utf8') as f:
        vocab_size = int(f.readline().split()[0])
        for l in f:
            l = l.split()
            vocab[l[0]] = int(l[1])
    print("{}: {} types".format(file, vocab_size))
    return vocab_size

def get_id(vocab, word, is_src=True):
    # is_src is unused but kept so call sites can mark the language side
    if word in vocab:
        return vocab[word]
    return UNK
src_vocab_size = load_vocab(src_vocab, args.src_vocab)
tgt_vocab_size = load_vocab(tgt_vocab, args.tgt_vocab)
if (not isinstance(src_vocab_size, int)) or (src_vocab_size < 0):
    raise ValueError("Invalid source vocab size")
if (not isinstance(tgt_vocab_size, int)) or (tgt_vocab_size < 0):
    raise ValueError("Invalid target vocab size")
with open(args.src, 'r', encoding='utf8') as fs:
    with open(args.tgt, 'r', encoding='utf8') as ft:
        src_sentences, tgt_sentences = list(), list()
        for ls in fs:
            ls = ls.split()
            lt = ft.readline().split()
            if len(ls) >= MAX_SENT_LEN:
                cut_num += 1
                ls = ls[:MAX_SENT_LEN - 1]
            if len(lt) >= MAX_SENT_LEN:
                cut_num += 1
                lt = lt[:MAX_SENT_LEN - 1]
            src_sent = [get_id(src_vocab, w) for w in ls] + [EOS]
            tgt_sent = [SOS] + [get_id(tgt_vocab, w, False) for w in lt]
            src_sentences.append(src_sent)
            tgt_sentences.append(tgt_sent)
# exclude the appended EOS / prepended SOS from the token counts
src_tokens = sum([len(s) - 1 for s in src_sentences])
tgt_tokens = sum([len(t) - 1 for t in tgt_sentences])
print("{}: {} sents, {} tokens, {:.2%} replaced by <UNK>".format(
    args.src, len(src_sentences), src_tokens, sum([s.count(UNK) for s in src_sentences]) / src_tokens))
print("{}: {} sents, {} tokens, {:.2%} replaced by <UNK>".format(
    args.tgt, len(tgt_sentences), tgt_tokens, sum([s.count(UNK) for s in tgt_sentences]) / tgt_tokens))
print("{} sentences were truncated to {} tokens".format(cut_num, MAX_SENT_LEN - 1))
with open(args.output, 'wb') as fo:
    # seg 1: source and target vocabulary sizes (4 bytes each)
    vocab_size = [src_vocab_size, tgt_vocab_size]
    vocab_size_pack = pack("i" * len(vocab_size), *vocab_size)
    fo.write(vocab_size_pack)
    # seg 2: number of sentence pairs (8 bytes)
    sent_num = [len(src_sentences)]
    sent_num_pack = pack("Q", *sent_num)
    fo.write(sent_num_pack)
    for i in range(len(src_sentences)):
        src_sent = src_sentences[i]
        tgt_sent = tgt_sentences[i]
        # seg 3: source and target sentence lengths (4 bytes each)
        src_tgt_length = [len(src_sent), len(tgt_sent)]
        src_tgt_length_pack = pack(
            "i" * len(src_tgt_length), *src_tgt_length)
        fo.write(src_tgt_length_pack)
        # seg 4: source and target token ids (4 bytes per token)
        src_sent_pack = pack("i" * len(src_sent), *src_sent)
        fo.write(src_sent_pack)
        tgt_sent_pack = pack("i" * len(tgt_sent), *tgt_sent)
        fo.write(tgt_sent_pack)
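# --- Hedged sketch (not part of the original script): decode the header and
# the first sentence pair of a file produced above, to verify the layout.
# 'train.bin' is an assumed example file name.
from struct import unpack

def peek_training_file(path='train.bin'):
    with open(path, 'rb') as f:
        src_vocab_size, tgt_vocab_size = unpack('ii', f.read(8))
        (pair_num,) = unpack('Q', f.read(8))
        src_len, tgt_len = unpack('ii', f.read(8))
        src_ids = unpack('i' * src_len, f.read(4 * src_len))
        tgt_ids = unpack('i' * tgt_len, f.read(4 * tgt_len))
    return src_vocab_size, tgt_vocab_size, pair_num, src_ids, tgt_ids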
'''
Convert a fairseq vocab to a NiuTrans.NMT vocab
Usage: python3 VocabConverter.py [fairseq_vocab] [niutrans_nmt_vocab]
'''
import sys
# User-defined special token ids (SOS and EOS share id 2)
PAD = 1
SOS = 2
EOS = 2
UNK = 3
with open(sys.argv[1], "r", encoding="utf8") as fi:
    with open(sys.argv[2], "w", encoding="utf8") as fo:
        lines = fi.readlines()
        # the first several indices are reserved
        start_id = UNK + 1
        # the first line: vocab_size, start_id
        fo.write("{} {}\n".format(len(lines) + start_id, start_id))
        # other lines: word, id
        for l in lines:
            fo.write("{} {}\n".format(l.split()[0], start_id))
            start_id += 1
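# Example (added for illustration): a fairseq dict line such as "the 1234"
# (word and frequency) becomes "the 4" here, since ids 0-3 are reserved for
# the special tokens above and numbering starts at start_id = UNK + 1 = 4.
# The first output line stores "<vocab_size> <start_id>", where vocab_size
# counts the reserved ids as well (len(lines) + start_id).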