Commit 3852f15a by huchi

Merge with branch: xiaotong-working

parent 98a9130d
# CMake minimum version
# cmake minimum version
cmake_minimum_required(VERSION 2.8)
# Project's name
project(NiuTensor)
# The prefix of the generated executable file
set(NIUTRANS_EXE "NiuTensor")
set(NIUTRANS_DLL "${NIUTRANS_EXE}")
# The name of the generated executable file
# The name of the dynamic link library
set(NIUTENSOR_EXE "NiuTensor")
set(NIUTENSOR_DLL "${NIUTENSOR_EXE}")
# Generated file path
set(EXECUTABLE_OUTPUT_PATH ../bin)
set(LIBRARY_OUTPUT_PATH ../lib)
# Use CMAKE_MACOSX_RPATH for MacOS
# Use CMAKE_MACOSX_RPATH for macOS
set(CMAKE_MACOSX_RPATH 1)
# Enable folder grouping in generated IDE projects
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
option(USE_CUDA "Use CUDA" OFF)
option(USE_HALF_PRECISION "Use Half Precision in CUDA Codes" OFF)
option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
option(USE_FP16 "Use FP16" OFF)
option(GEN_DLL "Generate Dynamic Link Library" OFF)
# If USE_CUDA is set to ON, modify CUDA_TOOLKIT_ROOT below.
# If USE_MKL is set to ON, modify INTEL_ROOT below.
# If USE_OPENBLAS is set to ON, modify OPENBLAS_ROOT below.
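# Typical configure commands (illustrative only; the same examples appear
# in the install guide further below):
#   cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..
#   cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..
#   cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..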
if (USE_CUDA)
if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
if(WIN32)
message(STATUS "HERE cuda")
set(CUDA_TOOLKIT_ROOT_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
else()
set(CUDA_TOOLKIT_ROOT_DIR "/usr/cuda-9.0")
set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-10.1")
endif()
endif()
message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}")
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
endif()
if(USE_MKL)
if(NOT DEFINED INTEL_ROOT)
if(WIN32)
message(STATUS "HERE mkl")
set(INTEL_ROOT "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows")
else()
set(INTEL_ROOT "/usr/intel/compilers_and_libraries_2020.2.254/linux")
set(INTEL_ROOT "/opt/intel/compilers_and_libraries_2020.2.254/linux")
endif()
endif()
message(STATUS "INTEL_ROOT: ${INTEL_ROOT}")
......@@ -49,9 +52,9 @@ endif()
if(USE_OPENBLAS)
if(NOT DEFINED OPENBLAS_ROOT)
if(WIN32)
set(OPENBLAS_ROOT "D:/software/BaiduNetdiskDownload/thirdparty20170624/OpenBLAS")
set(OPENBLAS_ROOT "C:/Program Files/OpenBLAS")
else()
set(OPENBLAS_ROOT "/usr/OpenBLAS")
set(OPENBLAS_ROOT "/opt/OpenBLAS")
endif()
endif()
message(STATUS "OPENBLAS_ROOT: ${OPENBLAS_ROOT}")
......@@ -90,74 +93,90 @@ endfunction(my_add_executable)
# Set libs and compiler options for CUDA
if(USE_CUDA)
add_definitions(-DUSE_CUDA)
if(USE_FP16)
if(USE_HALF_PRECISION)
add_definitions(-DHALF_PRECISION)
endif()
find_package(CUDA ${CUDA_VERSION} REQUIRED)
find_package(CUDA REQUIRED)
if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
endif()
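# GPU_ARCH maps a one-letter architecture code to the matching nvcc flags;
# e.g. configuring with -DGPU_ARCH=T selects "-arch=compute_75 -code=sm_75"
# for Turing GPUs.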
if(USE_HALF_PRECISION)
if(NOT DEFINED GPU_ARCH)
set(ARCH_FLAGS -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_72,code=sm_72
-gencode=arch=compute_70,code=compute_70
)
elseif(${GPU_ARCH} STREQUAL K OR ${GPU_ARCH} STREQUAL M)
message(FATAL_ERROR "your GPU does not support half-precision computation")
endif()
endif()
if(WIN32)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 -use_fast_math -DUSE_CUDA")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT_DIR}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64/")
link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
endif()
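# cublasLt.lib is linked only for CUDA 11, presumably because cuBLAS
# depends on the separate cuBLASLt library there (an assumption based on
# the CUDA 11 toolkit layout).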
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}curand.lib")
else()
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
if(USE_FP16)
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 -use_fast_math -DUSE_CUDA -DHALF_PRECISION -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
else()
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
endif()
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64/")
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
endif()
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
if(EXISTS "/usr/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
elseif(EXISTS "/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
endif()
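# The static CUDA libraries above need the dynamic loader at link time,
# so libdl is added explicitly; the chain probes its common locations on
# RHEL/CentOS (/usr/lib64) and Debian/Ubuntu (/lib/x86_64-linux-gnu).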
endif()
endif()
# Set libs and compiler options for MKL
if(USE_MKL)
add_definitions(-DMKL)
add_definitions(-DUSE_BLAS -DMKL)
set(COMPILER_DIR "${INTEL_ROOT}/compiler")
set(MKL_DIR "${INTEL_ROOT}/mkl")
set(CPU_ARCH intel64)
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -DMKL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG")
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
include_directories(${MKL_DIR}/include)
......@@ -169,9 +188,9 @@ if(USE_MKL)
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5md.lib")
else()
if(USE_CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder -DMKL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder")
else()
set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC -DMKL")
set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC")
endif(USE_CUDA)
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
......@@ -187,10 +206,9 @@ endif()
# Set libs and compiler options for OpenBLAS
if(USE_OPENBLAS)
add_definitions(-DUSE_BLAS -DMKL)
add_definitions(-DUSE_BLAS -DOPENBLAS)
set(OPENBLAS_INCLUDE_DIR "${OPENBLAS_ROOT}/include")
set(OPENBLAS_LIB_DIR "${OPENBLAS_ROOT}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_BLAS")
if(WIN32)
link_directories(${OPENBLAS_LIB_DIR})
include_directories(${OPENBLAS_INCLUDE_DIR})
......@@ -211,15 +229,15 @@ set(OPENBLAS_LIB ${OPENBLAS_LIB_PATH})
# Generate dynamic link library about project
if(USE_CUDA)
if(GEN_DLL)
cuda_add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
cuda_add_library(${NIUTENSOR_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
my_add_executable(${NIUTENSOR_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
endif()
else()
if(GEN_DLL)
add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES})
add_library(${NIUTENSOR_DLL} SHARED ${CPP_FILES} ${H_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES})
my_add_executable(${NIUTENSOR_EXE} ${CPP_FILES} ${H_FILES})
endif()
endif()
......@@ -243,17 +261,17 @@ if(WIN32)
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB})
message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
target_link_libraries(${NIUTENSOR_DLL} ${ALL_LIB})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File :" ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB})
message(STATUS "Name of Executable File :" ${NIUTENSOR_EXE})
target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB})
endif()
message(STATUS "${MESS}")
else()
add_definitions(-std=c++11)
set(MESS ${MESS} "On Linux")
set(MESS ${MESS} "On Linux or macOS")
if(USE_CUDA)
set(MESS ${MESS} " Use CUDA")
set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
......@@ -274,12 +292,12 @@ else()
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB} ${FLAG})
message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
target_link_libraries(${NIUTENSOR_DLL} ${ALL_LIB} ${FLAG})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File: " ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB} ${FLAG})
message(STATUS "Name of Executable File: " ${NIUTENSOR_EXE})
target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
endif()
......@@ -39,14 +39,14 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
##### CMake (Visual Studio)
To install the NiuTensor toolkit on the WIndows platform, you can use CMake to generate a Visual Studio project automatically (CMake and the Visual Studio IDE must be installed beforehand). The steps are as follows:
To install the NiuTensor toolkit on the Windows platform, you can use CMake to generate a Visual Studio project automatically (CMake and the Visual Studio IDE must be installed beforehand). The steps are as follows:
- Create a directory under the toolkit root to hold the generated Visual Studio project files (e.g., a build directory).
- Open a Windows command-line tool (e.g., PowerShell) in the project root and run `cd build` to enter the newly created build directory.
- Run CMake to generate the Visual Studio project (for Visual Studio versions older than 2019, add the extra CMake argument `-A x64` to the commands below). To build a dynamic link library, simply add the CMake argument `-DGEN_DLL=ON`; otherwise an executable is generated by default.
- To enable the MKL math library (installed separately by the user), simply add `-DUSE_MKL=ON` to the CMake command and specify the installation path of MKL (the Intel toolkit) via `-DINTEL_ROOT='/intel/root/path'`, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- To enable the OpenBLAS math library (installed separately by the user), simply add `-DUSE_OPENBLAS=ON` to the CMake command and specify the OpenBLAS installation path via `-DOPENBLAS_ROOT='/openblas/root/path'`, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- On success, CMake prints `Build files have been written to:...`
- Open the NiuTensor.sln file in the build directory to open the NiuTensor project in Visual Studio.
- After opening it, select NiuTensor in the Solution Explorer and right-click to set it as the startup project; you can then start using it.
......@@ -67,7 +67,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Open the CLion preferences, click CMake under the "Build, Execution, Deployment" tab, and configure the "CMake options"; once configured, CLion builds the project with CMake automatically. To build a dynamic link library, simply add the CMake argument `-DGEN_DLL=ON` to the "CMake options"; otherwise an executable is generated by default.
- To enable the MKL math library (installed separately by the user), simply enter `-DUSE_MKL=ON` in the "CMake options" and specify the installation path of MKL (the Intel toolkit) via `-DINTEL_ROOT='/intel/root/path'`, e.g. `-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- To enable the OpenBLAS math library (installed separately by the user), simply enter `-DUSE_OPENBLAS=ON` in the "CMake options" and specify the OpenBLAS installation path via `-DOPENBLAS_ROOT='/openblas/root/path'`, e.g. `-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- To enable the CUDA math library (installed separately by the user), simply enter `-DUSE_CUDA=ON` in the "CMake options", specify the CUDA installation path via `-DCUDA_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- To enable the CUDA math library (installed separately by the user), simply enter `-DUSE_CUDA=ON` in the "CMake options", specify the CUDA installation path via `-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
##### CMake (command line)
......@@ -78,7 +78,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run CMake to generate the project. To build a dynamic link library, simply add the CMake argument `-DGEN_DLL=ON`; otherwise an executable is generated by default.
- To enable the MKL math library (installed separately by the user), simply add `-DUSE_MKL=ON` to the CMake command and specify the installation path of MKL (the Intel toolkit) via `-DINTEL_ROOT='/intel/root/path'`, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- To enable the OpenBLAS math library (installed separately by the user), simply add `-DUSE_OPENBLAS=ON` to the CMake command and specify the OpenBLAS installation path via `-DOPENBLAS_ROOT='/openblas/root/path'`, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- On success, CMake prints `Build files have been written to:...` and generates a Makefile in that directory.
- Run `make -j` to compile the NiuTensor project; on success it prints `Built target NiuTensor` and the installation is complete.
......@@ -137,4 +137,4 @@ The NiuTensor open-source computing library was developed by the NiuTrans open-source team of the Natural Language Processing Lab at Northeastern University…
## Version updates
NiuTensor version 0.3.5 - February 6, 2021
NiuTensor version 0.4.0 - March 13, 2021
......@@ -27,6 +27,7 @@
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"
#include "./train/TTrain.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -38,8 +39,17 @@ using namespace nmt;
int main( int argc, const char ** argv )
{
if(argc > 1 && !strcmp(argv[1], "-test"))
XConfig config;
if(argc > 1){
config.Create(argc - 1, argv + 1);
verboseLevel = config.GetInt("verbose", 1);
}
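/* e.g. running "NiuTensor -test -verbose 2" (command shown for illustration)
   both selects the unit tests below and raises the logging level, since the
   XConfig lookup above reads "-verbose" with a default of 1 */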
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-testtrain"))
TestTrain();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
......@@ -47,7 +57,8 @@ int main( int argc, const char ** argv )
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, " Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-testtrain\" for test of the trainer!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
......
......@@ -93,6 +93,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for an activation function */
......
......@@ -89,6 +89,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a loss computation */
......
......@@ -125,6 +125,9 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
else{
ShowNTErrors("Unsupported backward computation! TODO!");
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a math operation */
......@@ -156,14 +159,16 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sign(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
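/* The locking idiom above recurs throughout this file: the memory pool's
   buffer is locked before NewTensorBufV2() hands out a temporary tensor,
   and unlocked after DelTensorBuf() returns it. A minimal sketch:

       if (a->mem != NULL)
           a->mem->LockBuf();
       XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
       ...                      // compute with tmp
       DelTensorBuf(tmp);       // return the buffer
       if (a->mem != NULL)
           a->mem->UnlockBuf();

   The NULL checks cover tensors that have no memory pool and hence no
   buffer to lock. */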
/*
......@@ -187,15 +192,17 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sin(a, tmp);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -219,14 +226,16 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Exp(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -251,8 +260,6 @@ void XMathGrad::GradLog(XTensor * node, bool isEfficient)
XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -276,8 +283,6 @@ void XMathGrad::GradRound(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -301,8 +306,6 @@ void XMathGrad::GradSign(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -326,14 +329,16 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -352,20 +357,23 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
/* dE/da = dE/dc * 1/(cos(a))^2
= dE/dc * (cos(a))^-2 */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_PowerMe(tmp, -2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -392,14 +400,16 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ClipBackward(node, a, node->grad, tmp, lower, upper);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -432,6 +442,8 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
= dE/dc * a * (-b^-2) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(b, tmp, -2.0F);
_NegateMe(tmp);
......@@ -439,9 +451,9 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
_Multiply(node->grad, tmp, b->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -478,9 +490,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
node->mem->LockBuf();
}
XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
......@@ -522,6 +542,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
// b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
......@@ -532,15 +553,22 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
DelTensorBuf(bGradTMP2);
DelTensorBuf(interGrad);
// b->mem->UnlockBuf();
}
DelTensorBuf(interGradTMP);
if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
node->mem->UnlockBuf();
}
DelTensorBuf(bTMP);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
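/* Note the lock order above: a->mem first, then b->mem, then node->mem,
   each skipped when it aliases an already-locked pool, with the unlocks
   in reverse order. Keeping this acquisition order consistent avoids
   deadlock when the tensors live in different memory pools. */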
/*
......@@ -602,8 +630,6 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
else{
ShowNTErrors("TODO!");
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -757,8 +783,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
if (!isEfficient || b->isGrad)
_MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -793,8 +817,6 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -830,6 +852,8 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
......@@ -842,12 +866,18 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->LockBuf();
}
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->UnlockBuf();
}
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -866,6 +896,9 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->LockBuf();
}
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
......@@ -876,11 +909,14 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
DelTensorBuf(bGradTMP2);
DelTensorBuf(interGrad);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->UnlockBuf();
}
}
DelTensorBuf(bGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -916,8 +952,6 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
if (b->isVar || b->income.tailNum > 0)
ShowNTErrors("TODO");
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -942,8 +976,6 @@ void XMathGrad::GradNegate(XTensor * node, bool isEfficient)
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, -1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -980,15 +1012,17 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, p - 1.0F);
_ScaleAndShiftMe(tmp, p);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
......@@ -1012,15 +1046,17 @@ void XMathGrad::GradReciprocal(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -2.0F);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1043,14 +1079,16 @@ void XMathGrad::GradSqrt(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ScaleMe(tmp, 2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1073,15 +1111,17 @@ void XMathGrad::GradSquare(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -0.5F);
_ScaleMe(tmp, 0.5);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1109,8 +1149,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1138,8 +1176,6 @@ void XMathGrad::GradScale(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1166,9 +1202,7 @@ void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1 / descale);
}
node->visitMark = NODE_FINISHED;
}
}
/*
......@@ -1194,8 +1228,6 @@ void XMathGrad::GradShift(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1229,8 +1261,6 @@ void XMathGrad::GradSub(XTensor * node, bool isEfficient)
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, -beta);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1275,12 +1305,16 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1301,6 +1335,8 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1315,10 +1351,10 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1352,8 +1388,6 @@ void XMathGrad::GradSum(XTensor * node, bool isEfficient)
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, beta);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1399,12 +1433,16 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1425,6 +1463,8 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1439,10 +1479,10 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1480,8 +1520,6 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
ShowNTErrors("TODO");
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1509,15 +1547,17 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_ScaleAndShiftMe(tmp, 1.0F / n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1545,13 +1585,15 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1576,14 +1618,16 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
DTYPE value = node->grad->Get0D();
tmp->SetDataFixed(value);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1605,9 +1649,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * f = NewTensorBufV2(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
......@@ -1636,11 +1685,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
}
DelTensorBuf(f);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
if (a->mem != NULL)
a->mem->UnlockBuf();
}
/*
......@@ -1663,9 +1715,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * f = NewTensorBufV2(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
......@@ -1693,11 +1750,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
}
DelTensorBuf(f);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
if (a->mem != NULL)
a->mem->UnlockBuf();
}
/*
......@@ -1742,10 +1802,14 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1766,6 +1830,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1777,6 +1843,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
......@@ -1815,9 +1883,6 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1884,6 +1949,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor* interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1895,6 +1962,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
......@@ -1933,9 +2002,6 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
}
......@@ -105,12 +105,19 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -138,12 +145,19 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -167,15 +181,20 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -193,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -205,9 +226,12 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -246,13 +270,16 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
dims[j++] = input->dimSize[i];
}
}
dims[0] = -dims[0];
dims[0] = -abs(dims[0]);
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
dims[whereToMerge - leadDim] *= abs(dims[0]);
int * dimsNode = dims + 1;
dimsNode[0] = -abs(dimsNode[0]);
XTensor gradNodeSmall(node->order - leadDim, dimsNode,
node->dataType, node->denseRatio,
node->devID, node->mem);
......@@ -296,6 +323,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -379,6 +407,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -407,6 +436,7 @@ void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -442,16 +472,21 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -528,14 +563,21 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
node->visitMark = NODE_DOING;
node->isGradFinished = true;
}
/*
......@@ -566,14 +608,19 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -603,15 +650,20 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
\ No newline at end of file
......@@ -101,6 +101,7 @@ void XNet::Backward(TensorList &roots)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
node->isGradFinished = false;
}
/* back-propagation from output to input */
......@@ -108,7 +109,7 @@ void XNet::Backward(TensorList &roots)
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
//CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark != NODE_FINISHED)
......@@ -127,7 +128,20 @@ void XNet::Backward(TensorList &roots)
delete node;
}
}
}
}
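/* a heuristic sanity check: the messages below warn when a node or its
   gradient connects to 100 or more nodes, which usually signals a
   mis-built computation graph */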
for (int i = 0; i < nodes.count; i++) {
XTensor* node = (XTensor*)nodes.Get(i);
if (node->income.tailNum >= 100 || node->outgo.tailNum >= 100) {
XPRINT(1, stderr, "Are you sure this node should connect to so many (100 or more) nodes?\n");
}
if (node->grad != NULL) {
XTensor* grad = node->grad;
if (grad->income.tailNum >= 100 || grad->outgo.tailNum >= 100) {
XPRINT(1, stderr, "Are you sure this grad node should connect to so many (100 or more) nodes?\n");
}
}
}
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -224,8 +224,6 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor maskDec;
XTensor maskEncDec;
bool debug(false);
/* encoder mask */
MakeMTMaskEnc(paddingEnc, maskEnc);
......@@ -234,25 +232,9 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);
if (debug) {
LOG("after encoding:");
encoding.mem->ShowMemUsage(stderr);
}
decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);
if (debug) {
LOG("after decoding:");
encoding.mem->ShowMemUsage(stderr);
}
outputLayer->Make(decoding, output, true, true);
if (debug) {
LOG("after outputing:");
encoding.mem->ShowMemUsage(stderr);
exit(0);
}
}
/*
......@@ -287,6 +269,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor* maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
......@@ -297,6 +280,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -305,6 +289,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType, 1.0F,
paddingEnc.devID, paddingEnc.mem);
......@@ -331,6 +316,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......@@ -344,7 +330,6 @@ void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
......@@ -378,7 +363,6 @@ void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
delete[] dims;
......@@ -571,4 +555,14 @@ void Model::Read(FILE* file)
LOG("model loaded (took %.1fs)", elapsed);
}
XModel* Model::Clone(int devID)
{
return nullptr;
}
bool Model::RunSimple(XList* inputs, XList* outputs, XList* golds, XList* losses)
{
return false;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -24,17 +24,18 @@
#include "Encoder.h"
#include "Decoder.h"
#include "Utility.h"
#include "submodel/FNN.h"
#include "submodel/Output.h"
#include "Utility.h"
#include "submodel/Attention.h"
#include "../../train/XModel.h"
namespace nmt
{
/* a nmt model that keeps parameters of the encoder,
/* an nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
class Model : public XModel
{
public:
/* device id */
......@@ -85,26 +86,26 @@ public:
/* make the encoding network */
XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
XTensor& MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrices */
void GetParams(TensorList& list);
......@@ -114,6 +115,13 @@ public:
/* read the parameters */
void Read(FILE* file);
public:
/* clone the model (overloaded method of XModel) */
XModel * Clone(int devID);
/* run the neural network (overloaded method of XModel) */
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -28,6 +28,7 @@
#include "Utility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XConfig.h"
using namespace nts;
using namespace std;
......@@ -165,89 +166,7 @@ int Config::LoadFromFile(const char* configFN, char** args) {
return argsNum;
}
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
/*
split string by delimiter, this will return indices of all sub-strings
......@@ -281,7 +200,9 @@ IntList SplitInt(const string& s, const string& delimiter)
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
/* this line is problematic: why do we need an IntList to keep an int64? */
values.Add((int)strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
......@@ -297,4 +218,4 @@ FloatList SplitFloat(const string& s, const string& delimiter)
return values;
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,16 +34,6 @@ namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
......@@ -115,10 +105,10 @@ public:
/* the maximum length in positional embedding */
int maxPosition;
/* the maximum length for the source sequence */
/* the maximum length of the source sequence */
int maxSrcLen;
/* the maximum length for the target sequence */
/* the maximum length of the target sequence */
int maxTgtLen;
/* the dimension of fnn hidden layer */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -259,7 +259,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = float(sqrt(d / nhead));
float scaling = (float)sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
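/* standard scaled dot-product attention: the queries are divided by
   sqrt(d_k), where d_k = d / nhead is the per-head dimension */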
......@@ -373,7 +373,7 @@ XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool i
xTrans = Transpose(x, 0, 1);
XTensor relative;
relative = BMMul(xTrans, X_NOTRANS, z, transposeFlag);
relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
XTensor relativeTrans;
relativeTrans = Transpose(relative, 0, 1);
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -67,9 +67,7 @@ void FNN::InitModel(Config& config)
float scale = 1.0F;
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
//w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
//w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -29,6 +29,7 @@
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#include "../../../train/XBaseTemplate.h"
using namespace std;
......@@ -74,8 +75,8 @@ struct ReservedIDs {
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
struct TrainDataSet : public DataDistributeBase
{
public:
/* the pointer to file stream */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -97,7 +97,6 @@ initialization
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
......@@ -242,17 +241,8 @@ void Trainer::Train(const char* fn, const char* validFN,
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
net.isGradEfficient = true;
bool debug(false);
if (debug) {
LOG("after forward:");
batchEnc.mem->ShowMemUsage(stderr);
exit(0);
}
if (doUpdate) {
/* back-propagation */
net.Backward(lossTensor);
if (model->encoder->useHistory)
......@@ -502,6 +492,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBufV2(v, v->devID, v->mem);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -511,6 +502,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)
base = (length + 5.0F) / (1.0F + 5.0F);
lp = float(pow(base, alpha));
lp = (float)pow(base, alpha);
return lp;
}
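/* a worked example (illustrative): with length = 10 and alpha = 0.6,
   base = (10 + 5) / 6 = 2.5 and lp = 2.5^0.6 ~= 1.73, so the score of a
   10-word hypothesis is normalized by ~1.73 rather than by its raw length */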
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -322,7 +322,7 @@ void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
//float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -493,8 +493,8 @@ void BeamSearch::Collect(StateBundle* beam)
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
//bool isCompleted = state.isCompleted &&
// (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
......@@ -557,7 +557,6 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
......@@ -874,4 +873,4 @@ void GreedySearch::Search(Model* model, XTensor& input,
delete[] finishedFlags;
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -161,7 +161,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
batchLoader.outputBuffer.emplace_back(emptyRes);
}
double startDump = GetClockSec();
//double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
......@@ -169,10 +169,10 @@ void Translator::Translate(const char* ifn, const char* sfn,
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
//double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.outputBuffer.size() + batchLoader.emptyLines.size());
wordCountTotal, batchLoader.inputBuffer.size() + batchLoader.emptyLines.size());
}
/*
......@@ -202,4 +202,4 @@ void Translator::Dump(FILE* file, XTensor* output)
}
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,14 +34,14 @@ void Vocab::Load(const string& src)
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
startID = (int)stol(sid);
vocabSize = (int)stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
word2id[word] = (int)stol(id);
id2word[(int)stol(id)] = word;
}
f.close();
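/* the file layout this loader expects, judging from the reads above (a
   sketch; the exact whitespace is whatever operator>> accepts):

       <vocabSize> <startID>
       <word> <id>
       <word> <id>
       ...

   e.g., a first line of "32000 4" would mean 32000 entries with ordinary
   words starting at id 4, ids 0..3 presumably reserved for special symbols */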
......@@ -75,4 +75,4 @@ void Vocab::CopyFrom(const Vocab& v)
id2word.insert(i2w);
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class keeps a batch of parameters.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
*/
#include "XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XConfig::XConfig()
{
n = 0;
args = NULL;
nReal = 0;
}
/* de-constructor */
XConfig::~XConfig()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
}
/* clear it */
void XConfig::Clear()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
n = 0;
args = NULL;
nReal = 0;
}
/*
create a config
>> myN - number of the input arguments
>> myArgs - the input arguments
*/
void XConfig::Create(const int myN, const char ** myArgs)
{
CheckNTErrors(myN > 0, "No input parameters to XConfig!");
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
args = NULL;
n = myN;
nReal = n * 2;
args = new char*[nReal];
for (int i = 0; i < nReal; i++) {
args[i] = NULL;
}
for (int i = 0; i < n; i++) {
CheckNTErrors(myArgs[i] != NULL, "Illegal parameter input!");
args[i] = new char[strlen(myArgs[i]) + 1];
strcpy(args[i], myArgs[i]);
}
}
/*
add an argument
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, const char * myValue)
{
CheckNTErrors(myArg != NULL, "No argument!");
if (n + 2 > nReal) {
nReal = MAX(n * 2 + 1, 128);
char ** newArgs = new char*[nReal];
memset(newArgs, 0, sizeof(char*) * nReal);
memcpy(newArgs, args, sizeof(char*) * n);
delete[] args;
args = newArgs;
}
args[n] = new char[strlen(myArg) + 2];
args[n][0] = '-';
strcpy(args[n] + 1, myArg);
n++;
if (myValue != NULL) {
args[n] = new char[strlen(myValue) + 1];
strcpy(args[n], myValue);
n++;
}
}
/*
add an argument (in integer)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, int myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%d", myValue);
Add(myArg, value);
}
/*
add an argument (in bool)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, bool myValue)
{
char value[2];
if (myValue)
value[0] = '1';
else
value[0] = '0';
value[1] = 0;
Add(myArg, value);
}
/*
add an argument (in float)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, float myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%f", myValue);
Add(myArg, value);
}
/*
load the value of an argument (in integer)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadInt(const char * name, int * p, int defaultP)
{
LoadParamInt(n, args, name, p, defaultP);
}
/*
load the value of an argument (in boolean)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadBool(const char * name, bool * p, bool defaultP)
{
LoadParamBool(n, args, name, p, defaultP);
}
/*
load the value of an argument (in float)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadFloat(const char * name, float * p, float defaultP)
{
LoadParamFloat(n, args, name, p, defaultP);
}
/*
load the value of an argument (in char string)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadString(const char * name, char * p, const char* defaultP)
{
LoadParamString(n, args, name, p, defaultP);
}
/*
get the value of an argument (in integer)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
int XConfig::GetInt(const char * name, int defaultP)
{
int r;
LoadInt(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in bool)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
bool XConfig::GetBool(const char * name, bool defaultP)
{
bool r;
LoadBool(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in float)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
float XConfig::GetFloat(const char * name, float defaultP)
{
float r;
LoadFloat(name, &r, defaultP);
return r;
}
/* get item number */
int XConfig::GetItemNum()
{
return n;
}
/*
get the item with offset i
>> i - offset
*/
char * XConfig::GetItem(int i)
{
if (i < n && i >= 0)
return args[i];
else
return NULL;
}
/*
initialize with another config model
>> myConfig - the config that we want to copy
*/
void XConfig::CreateFromMe(XConfig & myConfig)
{
Clear();
for (int i = 0; i < myConfig.GetItemNum(); i++)
Add(myConfig.GetItem(i), i);
}
/*
load the value of an argument (in integer)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in boolean)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
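/* note: unlike the other loaders, LoadParamBool treats the argument as a
   presence flag -- "-myFlag" alone sets *p to true and no value token is
   consumed; the default applies only when the flag is absent */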
/*
load the value of an argument (in float)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in char string)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
/*
show the argument list
>> argc - number of arguments
>> argv - arguments
*/
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
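/* a minimal usage sketch of the XConfig API above (illustrative, not part
   of this commit):

       const char * rawArgs[] = { "-dev", "0", "-lrate", "0.001" };

       XConfig config;
       config.Create(4, rawArgs);                     // copies the raw argument list
       int   dev   = config.GetInt("dev", -1);        // -> 0
       float lrate = config.GetFloat("lrate", 1.0F);  // -> 0.001F
       config.Add("beam", 4);                         // appends "-beam" "4"

   note that Create() expects arguments that already carry their '-' prefix,
   while Add() and the getters take bare names and prepend '-' themselves */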
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class defines a parameter keeper.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
* A new semester begins today.
*/
#ifndef __XCONFIG_H__
#define __XCONFIG_H__
#include "XGlobal.h"
#include "XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_WORD_LENGTH_IN_CONFIG 256
/* the parameter keeper */
class XConfig
{
private:
/* number of arguments */
int n;
/* argument list (in char*) */
char ** args;
/* number of slots we have allocated for these arguments */
int nReal;
public:
/* constructor */
XConfig();
/* de-constructor */
~XConfig();
/* clear it */
void Clear();
/* create a config */
void Create(const int myN, const char ** myArgs);
/* add an argument */
void Add(const char * myArg, const char * myValue);
/* add an argument (in integer) */
void Add(const char * myArg, int myValue);
/* add an argument (in bool) */
void Add(const char * myArg, bool myValue);
/* add an argument (in float) */
void Add(const char * myArg, float myValue);
/* load the value of an argument to a variable (in integer) */
void LoadInt(const char * name, int * p, int defaultP);
/* load the value of an argument to a variable (in boolean) */
void LoadBool(const char * name, bool * p, bool defaultP);
/* load the value of an argument to a variable (in float) */
void LoadFloat(const char * name, float * p, float defaultP);
/* load the value of an argument to a variable (in char string) */
void LoadString(const char * name, char * p, const char* defaultP);
/* get the value of an argument (in integer) */
int GetInt(const char * name, int defaultP);
/* get the value of an argument (in boolean) */
bool GetBool(const char * name, bool defaultP);
/* get the value of an argument (in float) */
float GetFloat(const char * name, float defaultP);
/* get item number */
int GetItemNum();
/* get the item with offset i */
char * GetItem(int i);
/* initialize with another config model */
void CreateFromMe(XConfig &myConfig);
};
#define MAX_PARAM_NUM 100
/* load arguments */
void extern LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void extern LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void extern LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void extern LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void extern ShowParams(int argc, char** argv);
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -182,10 +182,11 @@ void XDevice::Reset()
XMem * mem = GMems.GetMem(devID);
mem->Free();
#ifdef USE_CUDA
int devIDReset = devID;
Clear();
#ifdef USE_CUDA
if (devIDReset >= 0) {
int devIDBackup = -1;
cudaGetDevice(&devIDBackup);
......@@ -195,6 +196,8 @@ void XDevice::Reset()
cudaSetDevice(devIDBackup);
}
#else
Clear();
#endif
}
......
......@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE;
extern bool CONST_TRUE;
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
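// a minimal cross-platform usage sketch of the macros above (illustrative):
//
//     MUTEX_HANDLE mtx;
//     MUTEX_INIT(mtx);     // CRITICAL_SECTION on Windows, pthread_mutex_t elsewhere
//     MUTEX_LOCK(mtx);
//     /* ... critical section ... */
//     MUTEX_UNLOCK(mtx);
//     MUTEX_DELE(mtx);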
//#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG
......
......@@ -36,7 +36,7 @@ TensorListBase<T>::TensorListBase()
{
maxNum = 1;
count = 0;
items = (T*)malloc(sizeof(T) * 1);
items = new T[1];
}
/*
......@@ -49,7 +49,7 @@ TensorListBase<T>::TensorListBase(int myMaxNum)
CheckNTErrors(myMaxNum > 0, "check if the input number > 0");
maxNum = myMaxNum;
count = 0;
items = (T*)malloc(sizeof(T) * myMaxNum);
items = new T[myMaxNum];
}
/*
......@@ -62,7 +62,7 @@ TensorListBase<T>::TensorListBase(const T* inputItems, int inputItemCount)
CheckNTErrors(inputItemCount > 0, "check if the input number > 0");
maxNum = inputItemCount;
count = inputItemCount;
items = (T*)malloc(sizeof(T) * inputItemCount);
items = new T[inputItemCount];
memcpy(items, inputItems, inputItemCount * sizeof(T));
}
......@@ -73,7 +73,7 @@ TensorListBase<T>::TensorListBase(const TensorListBase<T>& l)
CheckNTErrors(l.maxNum > 0, "check if the input number > 0");
maxNum = l.maxNum;
count = l.count;
items = (T*)malloc(sizeof(T) * maxNum);
items = new T[maxNum];
memcpy(items, l.items, l.count * sizeof(T));
}
......@@ -94,7 +94,7 @@ TensorListBase<T> TensorListBase<T>::operator=(const TensorListBase<T>& l)
{
maxNum = l.maxNum;
count = l.count;
items = (T*)malloc(sizeof(T) * maxNum);
items = new T[maxNum];
memcpy(items, l.items, l.count * sizeof(T));
return *this;
}
......@@ -105,7 +105,7 @@ TensorListBase<T> TensorListBase<T>::operator=(TensorListBase<T>&& l)
{
maxNum = l.maxNum;
count = l.count;
items = (T*)malloc(sizeof(T) * maxNum);
items = new T[maxNum];
memcpy(items, l.items, l.count * sizeof(T));
return *this;
}
......@@ -115,10 +115,25 @@ template <typename T>
TensorListBase<T>::~TensorListBase()
{
if(items != NULL)
free(items);
delete[] items;
items = NULL;
}
/*
reallocate
>> itemNum - the number of items
*/
template <typename T>
void TensorListBase<T>::Reallocate(int itemNum)
{
if (maxNum < itemNum) {
T * newItems = new T[itemNum];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = itemNum;
}
}
/*
add an item into the list
......@@ -128,20 +143,10 @@ template <typename T>
void TensorListBase<T>::Add(T&& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
items[count++] = item;
......@@ -162,24 +167,49 @@ template <typename T>
void TensorListBase<T>::Add(const T& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
items[count++] = item;
}
/* add an item (as an integer) into the list */
template <typename T>
void TensorListBase<T>::AddInt(const int item)
{
if (count == maxNum)
Reallocate(count * 2 + 1);
*(int*)(items + count) = item;
count++;
}
/* add an item (as a float) into the list */
template <typename T>
void TensorListBase<T>::AddFloat(const float item)
{
if (count == maxNum)
Reallocate(count * 2 + 1);
*(float*)(items + count) = item;
count++;
}
/* add an item (as a long long) into the list */
template <typename T>
void TensorListBase<T>::AddLLong(const long long item)
{
if (count == maxNum)
Reallocate(count * 2 + 1);
*(long long*)(items + count) = item;
count++;
}
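/* a usage sketch of the typed accessors above (illustrative; assumes T is
   at least as wide as the stored type, e.g., XList with T = void*):

       XList list(8);
       list.AddInt(42);             // stores an int into a pointer-sized slot
       list.AddFloat(0.5F);
       int   i = list.GetInt(0);    // -> 42
       float f = list.GetFloat(1);  // -> 0.5F
*/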
/*
add a number of items into the list
>> inputItems - pointer to the array of items
......@@ -189,18 +219,10 @@ template <typename T>
void TensorListBase<T>::Add(const T* inputItems, int inputItemCount)
{
if (count + inputItemCount >= maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count + inputItemCount + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (maxNum + count + inputItemCount + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T* newItems = new T[maxNum + count + inputItemCount + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum += (count + inputItemCount + 1);
}
memcpy(items + count, inputItems, sizeof(T) * inputItemCount);
......@@ -226,18 +248,10 @@ template <typename T>
void TensorListBase<T>::Insert(int pos, const T& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
......@@ -251,18 +265,10 @@ template<typename T>
void TensorListBase<T>::Insert(int pos, T&& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
......@@ -274,16 +280,64 @@ void TensorListBase<T>::Insert(int pos, T&& item)
/* get the item at position i */
template <typename T>
T& TensorListBase<T>::GetItem(int i) const
inline T& TensorListBase<T>::GetItem(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
}
/* get the item at position i and force it to an integer */
template <typename T>
inline int TensorListBase<T>::GetItemInt(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return 0;
else {
T r = items[i];
void * p = &r;
return *(int*)p;
}
}
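/* note: the copy into a local T followed by the void* round-trip simply
   reinterprets the first sizeof(int) bytes of the slot; it mirrors AddInt
   and SetItemInt above, which store a raw int into a T-sized slot, so T
   must be at least as wide as the stored type */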
/* get the item at position i and force it to a float number */
template <typename T>
inline float TensorListBase<T>::GetItemFloat(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return 0;
else {
T r = items[i];
void * p = &r;
return *(float*)p;
}
}
/* get the item at position i and force it to a long long number */
template <typename T>
inline long long TensorListBase<T>::GetItemLLong(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return 0;
else {
T r = items[i];
void * p = &r;
return *(long long*)p;
}
}
/* set the item at position i */
template <typename T>
inline void TensorListBase<T>::SetItem(int i, const T& item)
......@@ -299,6 +353,33 @@ inline void TensorListBase<T>::SetItem(int i, T&& item)
items[i] = item;
}
/* set the item (as an integer) at position i */
template<typename T>
inline void TensorListBase<T>::SetItemInt(int i, const int item)
{
if (i >= 0 && i < count) {
*(int*)(items + i) = item;
}
}
/* set the item (as a float) at position i */
template<typename T>
inline void TensorListBase<T>::SetItemFloat(int i, const float item)
{
if (i >= 0 && i < count) {
*(float*)(items + i) = item;
}
}
/* set the item (as a long long) at position i */
template<typename T>
inline void TensorListBase<T>::SetItemLLong(int i, const long long item)
{
if (i >= 0 && i < count) {
*(long long*)(items + i) = item;
}
}
/*
find the position of the first matched item
>> item - the item for matching
......@@ -329,7 +410,7 @@ void TensorListBase<T>::Clear()
count = 0;
maxNum = 0;
if(items != NULL)
free(items);
delete[] items;
items = NULL;
}
......@@ -384,7 +465,7 @@ void TensorListBase<T>::Reserve(int n)
return;
}
items = (T*)malloc(sizeof(T) * n);
items = new T[n];
}
/*
......@@ -430,8 +511,8 @@ void TensorListBase<T>::ReadFromFile(FILE* fp, int num)
if(!items)
Reserve(num - maxNum);
else {
free(items);
items = (T*)malloc(sizeof(T) * num);
delete[] items;
items = new T[num];
}
}
fread(items, sizeof(T), num, fp);
......
......@@ -75,6 +75,9 @@ public:
/* de-constructor */
~TensorListBase();
/* reallocate */
void Reallocate(int itemNum);
/* add an item into the list */
void Add(T&& item);
......@@ -84,6 +87,15 @@ public:
/* add an item into the list */
void Add(const T& item);
/* add an item (as an integer) into the list */
void AddInt(const int item);
/* add an item (as a float) into the list */
void AddFloat(const float item);
/* add an item (as a long long) into the list */
void AddLLong(const long long item);
/* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount);
......@@ -99,12 +111,30 @@ public:
/* get the item at position i */
T& GetItem(int i) const;
/* get the item at position i and force it to an integer */
int GetItemInt(int i) const;
/* get the item at position i and force it to a float number */
float GetItemFloat(int i) const;
/* get the item at position i and force it to a long long number */
long long GetItemLLong(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* set the item (as an integer) at position i */
void SetItemInt(int i, const int item);
/* set the item (as a float) at position i */
void SetItemFloat(int i, const float item);
/* set the item (as a long long) at position i */
void SetItemLLong(int i, const long long item);
/* find the position of the first matched item */
int FindFirst(const T& item);
......@@ -135,7 +165,13 @@ public:
/* short */
T& operator[] (int i) const { return GetItem(i); };
T& Get(int i) const { return GetItem(i); };
int GetInt(int i) const { return GetItemInt(i); };
float GetFloat(int i) const { return GetItemFloat(i); };
long long GetLLong(int i) const { return GetItemLLong(i); };
void Set(int i, T item) { SetItem(i, item); };
void SetInt(int i, int item) { SetItemInt(i, item); };
void SetFloat(int i, float item) { SetItemFloat(i, item); };
void SetLLong(int i, long long item) { SetItemLLong(i, item); };
};
struct XTensor;
......
......@@ -54,6 +54,8 @@ XMem::XMem()
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
}
/*
......@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
}
......@@ -99,6 +103,8 @@ XMem::~XMem()
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
MUTEX_DELE(allocMutex);
MUTEX_DELE(bufMutex);
}
/*
......@@ -379,12 +385,18 @@ require a piece of memory
*/
void * XMem::Alloc(int myDevID, MTYPE mySize)
{
void * p = NULL;
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
return AllocStandard(myDevID, mySize);
p = AllocStandard(myDevID, mySize);
else if(isStatic)
return AllocStatic(myDevID, mySize);
p = AllocStatic(myDevID, mySize);
else
return AllocDynamic(myDevID, mySize);
p = AllocDynamic(myDevID, mySize);
MUTEX_UNLOCK(allocMutex);
return p;
}
/*
......@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{
MTYPE backOffset = 0;
/* NOTE THAT this is tricky because we lock the buffer
but DO NOT unlock it in this function. The unlock
happens when we call ReleaseBuf() */
//MUTEX_LOCK(bufMutex);
if(pitch > 1){
MTYPE address = (MTYPE)((char*)buf + bufUsed);
int offset = address % pitch;
......@@ -560,8 +577,10 @@ release a piece of memory
*/
void XMem::Release(int myDevID, void * p, MTYPE size)
{
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p, size);
MUTEX_UNLOCK(allocMutex);
}
/*
......@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
}
bufUsed -= (mySize + backOffset);
/* NOTE THAT this is a response to the lock in AllocBuf() */
//MUTEX_UNLOCK(bufMutex);
}
/*
......@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
return result;
}
/* lock the buffer mutex */
void XMem::LockBuf()
{
MUTEX_LOCK(bufMutex);
}
/* unlock the buffer mutex */
void XMem::UnlockBuf()
{
MUTEX_UNLOCK(bufMutex);
}
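/* the intended discipline, as used by the callers in this commit (a sketch):

       if (mem != NULL)
           mem->LockBuf();                              // claim the buffer
       XTensor * tmp = NewTensorBufV2(&a, a.devID, a.mem);
       ... work with tmp ...
       DelTensorBuf(tmp);                               // release in reverse order
       if (mem != NULL)
           mem->UnlockBuf();                            // give the buffer back
*/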
/*
find the highest set bit (or most significant set bit) in an integer-64
>> mySize - required size
......@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
}
}
}
else {
ShowNTErrors("No enough memory for buffer allocation!");
}
}
/* initialize it and set the global memory information */
......
......@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__
#include "XGlobal.h"
#include <stdio.h>
#include <stdlib.h>
......@@ -249,6 +250,13 @@ public:
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
private:
/* a mutex for memory allocation and release */
MUTEX_HANDLE allocMutex;
/* a mutex for buffer memory allocation and release */
MUTEX_HANDLE bufMutex;
public:
/* constructor */
......@@ -337,6 +345,12 @@ public:
/* allocate a piece of memory as "malloc" */
void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false);
/* lock the buffer mutex */
void LockBuf();
/* unlock the buffer mutex */
void UnlockBuf();
/* find the highest set bit (or most significant set bit) in an integer-64 */
int GetMSB(MTYPE mySize);
......
......@@ -215,7 +215,8 @@ void XQueue::DequeueJobs(XList * args)
int devID = *(int*)args->GetItem(1);
int devIDBackup = -1;
XDevice::SetDevice(devID, devIDBackup);
if(devID >= 0)
XDevice::SetDevice(devID, devIDBackup);
while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue();
......@@ -236,7 +237,8 @@ void XQueue::DequeueJobs(XList * args)
}
XDevice::SetDevice(devIDBackup);
if(devID >= 0)
XDevice::SetDevice(devIDBackup);
}
/* get the break flag */
......@@ -248,7 +250,11 @@ bool XQueue::GetJobBreak()
/* get the number of jobs */
int XQueue::GetJobNum()
{
return runningJobCount;
MUTEX_LOCK(jobQueueMutex);
int c = runningJobCount;
MUTEX_UNLOCK(jobQueueMutex);
return c;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -1985,6 +1985,19 @@ void XTensor::FlushToMem(XMem* targetMem)
}
}
/*
flush the data to the target device (with id)
>> myDevID - id of the target device
*/
void XTensor::FlushToDevice(int myDevID)
{
if (myDevID == devID)
return;
XMem * myMem = GMems.GetMem(myDevID);
FlushToMem(myMem);
}
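/* e.g., tensor.FlushToDevice(0) moves the tensor to GPU 0 and
   tensor.FlushToDevice(-1) brings it back to the CPU (following the usual
   devID convention in this codebase: >= 0 for GPUs, < 0 for the host) */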
/*
allocate the memory space of the tensor (in the global memory)
>> tensor - the tensor we intend to process
......
......@@ -457,6 +457,9 @@ public:
/* flush the data to the target device */
void FlushToMem(XMem * targetMem);
/* flush the data to the target device (with id) */
void FlushToDevice(int myDevID);
/* allocate the memory space of the tensor (in the global memory) */
static
void AllocateData(XTensor * tensor, XMem * myMem = NULL, bool useBuf = false);
......
......@@ -54,37 +54,6 @@ namespace nts{
(unsigned)(flag), (unsigned *)(id))
#endif
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile XList*);
/*
......
......@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size)
cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo)
{
if(devIDFrom < 0 && devIDTo < 0)
return cudaMemcpyHostToHost;
return cudaMemcpyKind::cudaMemcpyHostToHost;
else if(devIDFrom < 0 && devIDTo >= 0)
return cudaMemcpyHostToDevice;
return cudaMemcpyKind::cudaMemcpyHostToDevice;
else if(devIDFrom >= 0 && devIDTo < 0)
return cudaMemcpyDeviceToHost;
return cudaMemcpyKind::cudaMemcpyDeviceToHost;
else
return cudaMemcpyDeviceToDevice;
return cudaMemcpyKind::cudaMemcpyDeviceToDevice;
}
#endif
......@@ -485,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
/* sleep for a while */
void XSleep(int sleepTime)
{
if (sleepTime <= 0)
return;
#ifdef _WIN32
Sleep((DWORD)sleepTime);
#else
......@@ -553,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
hi = (char*)data + (long)realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
indexhi = index != NULL ? (int*)index + (long)stride * (num - 1) : NULL;
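/* note: the (long) casts above widen the offset arithmetic so that
   (num - 1) * stride is not computed in 32-bit int, an overflow risk
   when sorting very large tensors */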
recurse:
......@@ -565,8 +568,8 @@ recurse:
if(size <= MIN_QSORT_NUM)
XShortSort(lo, hi, indexlo, indexhi, width, stride, comp);
else {
mid = lo + (size/2) * realStride;
indexmid = indexlo + (size/2) * stride;
mid = lo + (long)(size/2) * realStride;
indexmid = indexlo + (long)(size/2) * stride;
/* sort the first, last and middle elements into order */
if(comp(lo, mid) > 0)
......@@ -834,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items)
return 0;
if (sepLen == 0) {
char* item = new char[inputLen + 1];
char* item = new char[(long)inputLen + 1];
strcpy(item, inputString);
items->Add(item);
}
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
//int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
//int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
}
\ No newline at end of file
}
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
......@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
>> index - index of the output dimension (over the vocabulary)
>> onehot - one-hot representation of the index
>> size - vocabulary size (last dimension size of onehot)
>> labelSmoothingP - the parameter that controls how smooth the output is.
E.g., p = 0 means no smoothing
p = 1 means a uniform distribution (almost)
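          E.g., with size = 4 and p = 0.1, the target position would get
          1 - p = 0.9 and each other position p / (size - 1), i.e., ~0.033
          (assuming the usual smoothing scheme; this would also explain why
          p = 1 gives only an *almost* uniform distribution - the target
          position itself goes to 0)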
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP)
......
......@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
CheckNTErrors(sIndex < s->unitNum && sIndex >= 0, "Wrong index!");
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
int dim = 0;
int order = source->order;
//int dim = 0;
//int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
......@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source)
return target.Get0D();
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it along with a dimension.
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
*/
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int
transform a tensor by merging it along with a dimension (return an XTensor structure)
make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
......
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
......
......@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
......@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -43,13 +43,11 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
int blockSize = 1;
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
XTensor * smallsItem0 = smalls->GetItem(0);
int unitNum = smallsItem0->unitNum;
//int unitNum = smallsItem0->unitNum;
int unitSize = smallsItem0->unitSize;
int itemSize = unitNum * unitSize;
for (int i = 0; i < smallsItem0->order; i++) {
if (i >= dim)
......@@ -129,7 +127,7 @@ bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
for (int i = 0; i < tensor->order; i++) {
for (int i = 0; i < order; i++) {
if (i < dim) {
if (t.GetDim(i) != tensor->GetDim(i))
return false;
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......
......@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(max);
DelTensorBuf(sum);
if (mem != NULL)
mem->UnlockBuf();
if (x->devID >= 0) {
delete blockx;
......
......@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * max = NULL;
XTensor * sum = NULL;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(sum);
DelTensorBuf(max);
if (mem != NULL)
mem->UnlockBuf();
delete[] dimSize;
}
......
......@@ -354,8 +354,10 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL)
output->mem->LockBuf();
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
output->devID, output->mem);
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
......@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL)
output->mem->UnlockBuf();
return loss;
}
......
......@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
......@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
}
/*
......@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
......@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
return loss;
}
......
......@@ -215,12 +215,7 @@ bool TestConvertDataType3()
{0.5F, -4.0F},
{0.0F, 6.0F} };
DTYPE data2[2][3] = { {1.0F, 2.0F, 3.0F},
{0.0F, 4.0F, 5.0F} };
DTYPE answer[3][3] = { {1.0F, -6.0F, -7.0F},
{0.5F, -15.0F, -18.5F},
{0.0F, 24.0F, 30.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -241,6 +236,14 @@ bool TestConvertDataType3()
cpuTest = _CheckData(a, data1, unitNum1, 1e-4F);
#ifdef USE_CUDA
DTYPE data2[2][3] = { { 1.0F, 2.0F, 3.0F },
{ 0.0F, 4.0F, 5.0F } };
DTYPE answer[3][3] = { { 1.0F, -6.0F, -7.0F },
{ 0.5F, -15.0F, -18.5F },
{ 0.0F, 24.0F, 30.0F } };
/* GPU test */
bool gpuTest = true;
......
......@@ -67,7 +67,6 @@ bool TestGather1()
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
......
......@@ -422,7 +422,7 @@ bool TestSetData6()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
//DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
/* CPU test */
bool cpuTest = true;
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We test XTrain here. It is simple: we design a toy task in which we
* make the model predict an integer E (0-100) from four input integers
* A, B, C and D (0-100). We generate a number of samples with different values
* of A, B, C and D. The gold standard is
*
* E = (int)(sqrt(A * B) + abs(C - D))/2
*
* Our model is a two-layer feed-forward neural network. It can be treated
* as a classifier rather than a regression model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
*/
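As a worked instance of the gold standard above (values chosen for easy arithmetic): with A = 16, B = 4, C = 30, D = 10,

    E = (int)((sqrt(16 * 4) + abs(30 - 10)) / 2) = (int)((8 + 20) / 2) = 14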
#include "TTrain.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/function/FHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor * tmpTT = NULL;
/* generate the training data file */
void GeneateTTrainData(const char * fileName)
{
FILE * file = fopen(fileName, "wb");
CheckNTErrors(file, "Cannot open the file");
XPRINT(1, stderr, "[INFO] Generating data ... ");
int sampleNum = MAX_SAMPLE_NUM_IN_TTRAIN;
int range = MAX_INT_IN_TTRAIN;
fprintf(file, "%d\n", sampleNum);
srand(1);
for (int i = 0; i < sampleNum; i++) {
int A = (int)(((float)rand() / RAND_MAX) * range);
int B = (int)(((float)rand() / RAND_MAX) * range);
int C = (int)(((float)rand() / RAND_MAX) * range);
int D = (int)(((float)rand() / RAND_MAX) * range);
int E = (int)((sqrt(A * B) + abs(C - D)) / 2);
fprintf(file, "%d %d %d %d %d\n", A, B, C, D, E);
}
XPRINT2(1, stderr, "%d samples in \"%s\" [DONE]\n", sampleNum, fileName);
fclose(file);
}
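The generated file therefore starts with the sample count and then holds one sample per line, "A B C D E" (illustrative values; the last field is the gold standard computed by the formula above):

200000
16 4 30 10 14
25 4 50 20 20
...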
/* run the test */
void TestTrain()
{
GeneateTTrainData("ttrain.txt");
XConfig config;
//config.Add("dev", -1);
config.Add("lrate", 0.1F);
config.Add("nstep", 100000);
config.Add("nepoch", 5);
config.Add("jobdev0", 0);
//config.Add("jobdev4", -1);
int serverDevID = config.GetInt("jobdev0", -1);
TTDataLoader loader;
loader.SetFileName("ttrain.txt");
loader.SetBatchSize(config.GetInt("batchsize", TT_BATCH_SIZE));
TTModel model;
model.Init(config, serverDevID);
tmpTT = model.params[0].param;
XOptimizer optimizer;
optimizer.Init(config);
XTrainer trainer;
trainer.Run(&config, &loader, &model, &optimizer);
}
/*****************************
* data loader
******************************/
/* constructor */
TTDataLoader::TTDataLoader()
{
fileName = new char[MAX_FILE_NAME_LENGTH];
file = NULL;
batchSize = TT_BATCH_SIZE;
}
/* de-constructor */
TTDataLoader::~TTDataLoader()
{
delete[] fileName;
}
/* set file name */
void TTDataLoader::SetFileName(const char * myFileName)
{
strcpy(fileName, myFileName);
}
/* set batch size */
void TTDataLoader::SetBatchSize(int myBatchSize)
{
batchSize = myBatchSize;
}
/* start the process */
bool TTDataLoader::Start()
{
file = fopen(fileName, "rb");
CheckNTErrors(file != NULL, "Cannot open the file");
/* skip the first line */
char * line = new char[MAX_SAMPLE_LINE_LENGTH];
fgets(line, MAX_SAMPLE_LINE_LENGTH, file);
delete[] line;
return true;
}
/* end the process */
bool TTDataLoader::End()
{
fclose(file);
return true;
}
/*
get a batch of samples
>> inputs - inputs of the model
>> golds - gold standards
*/
bool TTDataLoader::GetBatchSimple(XList * inputs, XList * golds)
{
CheckNTErrors(file != NULL, "No input file specified!");
CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong argument!");
CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong argument!");
XTensor * input = (XTensor*)inputs->GetItem(0);
XTensor * gold = (XTensor*)golds->GetItem(0);
int count = 0;
int sampleSize = MAX_SAMPLE_SIZE;
char * line = new char[MAX_SAMPLE_LINE_LENGTH];
int * inputBatch = new int[batchSize * sampleSize];
int * goldBatch = new int[batchSize];
int A, B, C, D, E;
while (fgets(line, MAX_SAMPLE_LINE_LENGTH, file)) {
if (count == batchSize)
break;
if (sscanf(line, "%d %d %d %d %d", &A, &B, &C, &D, &E) < sampleSize + 1) {
ShowNTErrors("Wrong format in the training file!");
}
inputBatch[count * sampleSize] = A;
inputBatch[count * sampleSize + 1] = B;
inputBatch[count * sampleSize + 2] = C;
inputBatch[count * sampleSize + 3] = D;
goldBatch[count] = E;
count++;
}
if (count > 0) {
InitTensor2D(input, count, 4, X_INT);
InitTensor2D(gold, count, 1, X_INT);
input->SetData(inputBatch, count * 4);
gold->SetData(goldBatch, count);
}
delete[] line;
delete[] inputBatch;
delete[] goldBatch;
if (count > 0)
return true;
else
return false;
}
/*****************************
* the neural model
******************************/
/* constructor */
TTModel::TTModel()
{
devID = -1;
vSize = 0;
eSize = 0;
hSize = 0;
}
/* de-constructor */
TTModel::~TTModel()
{
}
/* config it */
void TTModel::SetConfig(XConfig &myConfig)
{
config.CreateFromMe(myConfig);
}
/*
initialize the model
>> myConfig - configuration
>> devID - device id
*/
void TTModel::Init(XConfig &myConfig, int myDevID)
{
Clear();
SetConfig(myConfig);
devID = myDevID;
vSize = MAX_INT_IN_TTRAIN + 1;
eSize = config.GetInt("esize", TT_EMBEDDING_SIZE);
hSize = config.GetInt("hsize", TT_HIDDEN_SIZE);
InitTensor2D(&embeddingW, vSize, eSize, X_FLOAT, devID);
InitTensor2D(&hiddenW, MAX_SAMPLE_SIZE * eSize, hSize, X_FLOAT, devID);
InitTensor2D(&outputW, hSize, vSize, X_FLOAT, devID);
embeddingW.SetName("embeddingw");
hiddenW.SetName("hiddenw");
outputW.SetName("outputw");
embeddingW.SetDataRand(-0.1F, 0.1F);
hiddenW.SetDataRand(-0.1F, 0.1F);
outputW.SetDataRand(-0.1F, 0.1F);
AddParam(&embeddingW);
AddParam(&hiddenW);
AddParam(&outputW);
}
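With the default sizes above (vSize = MAX_INT_IN_TTRAIN + 1 = 101, eSize = TT_EMBEDDING_SIZE = 128, hSize = TT_HIDDEN_SIZE = 512), the three parameter matrices come out as embeddingW: 101 x 128, hiddenW: (4 * 128) x 512 = 512 x 512, and outputW: 512 x 101.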
/*
create the neural network and run it (forward pass)
>> devID - device id
>> input - as it is
>> output - as it is
*/
void TTModel::Forward(int devID, XTensor * input, XTensor * output)
{
XTensor embedding;
XTensor embeddingCat;
XTensor hidden;
/* [e_0, e_1, e_2, e_3] = w_e * input(one-hot) */
embedding = Gather(embeddingW, *input);
/* e = merge(e_0, e_1, e_2, e_3) */
embeddingCat = Merge(embedding, embedding.order - 1, embedding.order - 2);
/* h = hardtanh(e * w_h) */
hidden = HardTanH(MMul(embeddingCat, hiddenW));
/* output = Softmax(h * w_o) */
*output = Softmax(MMul(hidden, outputW), -1);
}
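In equation form, Forward computes (a restatement of the comments above, where $W_h$ is hiddenW, $W_o$ is outputW, and $e_i$ is the embedding row gathered for the $i$-th input integer):

$$\mathbf{e} = [\,e_0; e_1; e_2; e_3\,], \qquad \mathbf{h} = \mathrm{hardtanh}(\mathbf{e}\,W_h), \qquad \mathbf{y} = \mathrm{softmax}(\mathbf{h}\,W_o)$$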
/* clear the model */
void TTModel::Clear()
{
config.Clear();
}
/*
clone the model
>> devID - device id
*/
XModel * TTModel::Clone(int devID)
{
TTModel * model = new TTModel();
model->SetConfig(config);
model->Init(config, devID);
CopyValues(embeddingW, model->embeddingW);
CopyValues(hiddenW, model->hiddenW);
CopyValues(outputW, model->outputW);
return model;
}
/*
run the neural network
>> inputs - inputs of the model
>> outputs - outputs of the model
>> golds - gold standards
>> losses - losses of the output with respect to the gold standards
*/
bool TTModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList* losses)
{
//fprintf(stderr, "run simple 0\n");
CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong arguments!");
CheckNTErrors(outputs != NULL && outputs->count >= 1, "Wrong arguments!");
CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong arguments!");
CheckNTErrors(losses != NULL && losses->count >= 1, "Wrong arguments!");
XTensor * input = (XTensor*)inputs->GetItem(0);
XTensor * output = (XTensor*)outputs->GetItem(0);
XTensor * gold = (XTensor*)golds->GetItem(0);
XTensor * loss = (XTensor*)losses->GetItem(0);
XTensor goldOneHot;
/* place all input data on the correct device */
input->FlushToDevice(devID);
output->FlushToDevice(devID);
gold->FlushToDevice(devID);
XNet net;
/* create the neural network and run it */
Forward(devID, input, output);
/* gold standard in one-hot representation */
goldOneHot = IndexToOnehot(*gold, vSize, 0.0F);
int * dims = new int[goldOneHot.order];
for (int i = 0; i < goldOneHot.order - 2; i++)
dims[i] = goldOneHot.GetDim(i);
dims[goldOneHot.order - 2] = goldOneHot.GetDim(goldOneHot.order - 1);
goldOneHot.Reshape(goldOneHot.order - 1, dims);
/* loss */
*loss = CrossEntropy(*output, goldOneHot);
/* back-propagation */
net.Backward(*loss);
delete[] dims;
//fprintf(stderr, "run simple 1\n");
return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We test XTrain here. It is simple: we design a toy task in which we
* make the model predict an integer E (0-100) from four input integers
* A, B, C and D (0-100). We generate a number of samples with different values
* of A, B, C and D. The gold standard is
*
* E = (int)(sqrt(A * B) + abs(C - D))/2
*
* Our model is a two-layer feed-forward neural network. It can be treated
* as a classifier rather than a regression model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
* The express train was updated this year. It takes me just two and
* a half hours from Shenyang to Beijing.
*/
#ifndef __TTRAIN_H__
#define __TTRAIN_H__
#include <stdio.h>
#include <stdlib.h>
#include "XTrainer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_SAMPLE_NUM_IN_TTRAIN 200000
#define MAX_INT_IN_TTRAIN 100
#define MAX_SAMPLE_LINE_LENGTH 128
#define MAX_SAMPLE_SIZE 4
#define TT_BATCH_SIZE 256
#define TT_EMBEDDING_SIZE 128
#define TT_HIDDEN_SIZE 512
extern XTensor * tmpTT;
/* generate the training data file */
void GeneateTTrainData(const char * fileName);
/* run the test */
extern
void TestTrain();
/* data loader */
class TTDataLoader : public DataDistributeBase
{
protected:
/* file name */
char * fileName;
/* file handle */
FILE * file;
/* batch size */
int batchSize;
public:
/* constructor */
TTDataLoader();
/* de-constructor */
~TTDataLoader();
/* set file name */
void SetFileName(const char * myFileName);
/* set batch size */
void SetBatchSize(int myBatchSize);
/* start the process */
bool Start();
/* end the process */
bool End();
/* get a batch of samples */
bool GetBatchSimple(XList * inputs, XList * golds);
};
/* the model */
class TTModel : public XModel
{
protected:
/* device id */
int devID;
/* configuration */
XConfig config;
/* embedding matrix of the input */
XTensor embeddingW;
/* parameter matrix of the hidden layer */
XTensor hiddenW;
/* parameter matrix of the output layer */
XTensor outputW;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* hidden layer size */
int hSize;
public:
/* constructor */
TTModel();
/* de-constructor */
~TTModel();
/* config it */
void SetConfig(XConfig &myConfig);
/* initialize the parameters */
void Init(XConfig &myConfig, int myDevID);
/* create the neural network and run it (forward pass) */
void Forward(int devID, XTensor * input, XTensor * output);
/* clear the model */
void Clear();
/* clone the model */
XModel * Clone(int devID);
/* run the neural network */
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We define various template classes here. They will be overloaded and used
* in applications.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
*/
#include "XBaseTemplate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*******************************
* data loader template
*******************************/
/* constructor */
DataDistributeBase::DataDistributeBase()
{
MUTEX_INIT(loadMutex);
}
/* de-constructor */
DataDistributeBase::~DataDistributeBase()
{
MUTEX_DELE(loadMutex);
}
/* start the job (e.g., open the file) */
bool DataDistributeBase::Start()
{
ShowNTErrors("DataDistributeBase::Start must be overloaded!");
return true;
}
/* end the job (e.g., close the file) */
bool DataDistributeBase::End()
{
ShowNTErrors("DataDistributeBase::End must be overloaded!");
return true;
}
/*
get a batch of samples
>> inputs - inputs of the model
>> golds - gold standards
*/
bool DataDistributeBase::GetBatchSimple(XList * inputs, XList * golds)
{
return false;
}
/* get a batch of samples */
bool DataDistributeBase::GetBatch(XList * args)
{
CheckNTErrors(args->count >= 2, "More input arguments are required!");
XList * input = (XList*)args->GetItem(0);
XList * gold = (XList*)args->GetItem(1);
if (GetBatchSimple(input, gold))
return true;
ShowNTErrors("You must be overload one of these: DataDistributeBase::GetBatchSimple ... !");
return false;
}
/* get a batch of samples (for multi-threading) */
bool DataDistributeBase::GetBatchSafe(XList * args)
{
bool r;
MUTEX_LOCK(loadMutex);
r = GetBatch(args);
MUTEX_UNLOCK(loadMutex);
return r;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We define various template classes here. They will be overloaded and used
* in applications.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
* The meeting at 3:00pm today was canceled. More time for coding.
*/
#ifndef __XNETTEMPLATE_H__
#define __XNETTEMPLATE_H__
#include "../tensor/XTensor.h"
#include "../tensor/XThread.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
data distributor template. It distributes batches of data to workers.
The use of data distributor follows:
Start() -> GetBatch() -> ... -> GetBatch() -> End()
In addition, GetBatch() should be thread-safe, and thus could be
called by different threads simultaneously.
*/
class DataDistributeBase
{
protected:
/* mutex of batch loading */
MUTEX_HANDLE loadMutex;
public:
/* constructor */
DataDistributeBase();
/* de-constructor */
~DataDistributeBase();
/* start the job (e.g., open the file).
NOTE THAT before calling Start() one should initialize
the distributor if necessary */
virtual
bool Start();
/* end the job (e.g., close the file) */
virtual
bool End();
/* get a batch of samples */
virtual
bool GetBatchSimple(XList * inputs, XList * golds);
public:
/* get a batch of samples */
bool GetBatch(XList * args);
/* get a batch of samples (for multi-threading) */
bool GetBatchSafe(XList * args);
};
}
#endif // __XNETTEMPLATE_H__
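A minimal usage sketch of the Start() -> GetBatch() -> ... -> End() protocol described above, assuming a hypothetical subclass MyLoader that overloads GetBatchSimple() (TTDataLoader in TTrain.cpp has exactly this shape):

MyLoader loader;               /* a DataDistributeBase subclass (hypothetical) */
XTensor input, gold;           /* tensors the loader will fill per batch */
XList inputs, golds;
inputs.Add(&input);            /* GetBatchSimple() expects at least one item */
golds.Add(&gold);

loader.Start();                /* e.g., open the file */
while (loader.GetBatchSimple(&inputs, &golds)) {
    /* hand the batch to a worker here */
}
loader.End();                  /* e.g., close the file */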
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A "leader" manages a number of "workers". The leader recieves jobs from
* the central server (can be remote), or acts as an independent server itself.
* For workers, the leader is the one who issues orders and organizes them.
* Note that the leader and workers must be on the same machine. In case of
* multi-machine training, one can deploy different leaders on different
* machines. BUT, at this time, we need an additional way of distributing
* data across machines.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
*/
#include "XLeader.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XLeader::XLeader()
{
id = -1;
}
/* de-constructor */
XLeader::~XLeader()
{
}
/* initialize the leader */
void XLeader::Init()
{
for (int i = 0; i < jworkers.count; i++)
delete (XWorkerJob*)jworkers.GetItem(i);
jworkers.Clear();
for (int i = 0; i < cworkers.count; i++)
delete (XWorkerCollect*)cworkers.GetItem(i);
cworkers.Clear();
for (int i = 0; i < uworkers.count; i++)
delete (XWorkerUpdate*)uworkers.GetItem(i);
uworkers.Clear();
for (int i = 0; i < bworkers.count; i++)
delete (XWorkerBroadcast*)bworkers.GetItem(i);
bworkers.Clear();
serverRecord.Clear();
}
/* set id */
void XLeader::SetID(int myID)
{
id = myID;
}
/* get id */
int XLeader::GetID()
{
return id;
}
/*
Set the server model. It distributes the server-side parameters on different devices.
>> config - the configuration
>> model - the base model
>> memberModels - the models that run on different devices. We can place
the server-side parameters on different member models.
*/
void XLeader::SetServerModel(XConfig * config, XModel * model, XList * memberModels)
{
serverModel.Clear();
for (int i = 0; i < model->paramNum; i++) {
XTensor * param = model->params[i].param;
serverModel.AddParam(param);
}
/* TODO: we can place parameters on different devices */
}
/*
set the server model. It distributes the server-side parameters on different devices.
>> config - the configuration
>> model - the base model
*/
void XLeader::SetServerModel(XConfig * config, XModel * model)
{
XList members;
for (int i = 0; i < jworkers.count; i++) {
XModel * member = ((XWorkerJob*)jworkers[i])->GetModel();
members.Add(member);
}
SetServerModel(config, model, &members);
}
/* initialize the models for running them */
void XLeader::InitForRun()
{
serverModel.InitForRun();
for (int i = 0; i < jworkers.count; i++) {
XModel* model = ((XWorkerJob*)jworkers[i])->GetModel();
model->InitForRun();
}
XList workers;
workers.AddList(&jworkers);
workers.AddList(&cworkers);
workers.AddList(&uworkers);
workers.AddList(&bworkers);
for (int i = 0; i < workers.count; i++) {
XWorker* worker = (XWorker*)workers[i];
CheckNTErrors(worker->IsEmpty(), "Something is wrong with the finishedQueue!");
}
}
/* set grad = 0 */
void XLeader::ResetParamGrad()
{
for (int i = 0; i < serverModel.paramNum; i++) {
XTensor* param = serverModel.params[i].param;
if (param->grad != NULL) {
param->grad->SetZeroAll();
}
}
for (int j = 0; j < jworkers.count; j++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[j];
XModel * model = worker->GetModel();
for (int i = 0; i < model->paramNum; i++) {
XTensor* param = model->params[i].param;
if (param->grad != NULL) {
param->grad->SetZeroAll();
}
}
}
}
/*
wait for finished states (i.e., all workers finish their jobs)
>> activeJobWorkers - indicates whether each job worker is active
>> isToUpdate - indicates whether the model is updated
*/
void XLeader::WaitForFinishing(const int* activeJobWorkers, const int isToUpdate)
{
int activeCount = 0;
for (int i = 0; i < jworkers.count; i++) {
if (activeJobWorkers[i] > 0) {
XWorker* worker = (XWorker*)jworkers[i];
worker->DequeueFinishedJob();
activeCount++;
}
}
if (activeCount > 0 && isToUpdate) {
for (int i = 0; i < cworkers.count; i++) {
XWorker* worker = (XWorker*)cworkers[i];
worker->DequeueFinishedJob();
}
for (int i = 0; i < uworkers.count; i++) {
XWorker* worker = (XWorker*)uworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
}
for (int i = 0; i < bworkers.count; i++) {
XWorker* worker = (XWorker*)bworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
}
}
}
/* get loss */
float XLeader::GetLoss()
{
return serverRecord.lossAll;
}
/* get sample number */
int XLeader::GetSampleNum()
{
return serverRecord.sampleNum;
}
/* get prediction number */
int XLeader::GetPredictNum()
{
return serverRecord.predictNum;
}
/*
set the communication mode
>> myMode - the mode
*/
void XLeader::SetMode(XLEADER_MODE myMode)
{
mode = myMode;
}
/* set the flag of instant run */
void XLeader::SetInstantRun(bool flag)
{
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers.GetItem(i);
worker->SetInstantRun(flag);
}
for (int i = 0; i < cworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)cworkers.GetItem(i);
worker->SetInstantRun(flag);
}
for (int i = 0; i < uworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)uworkers.GetItem(i);
worker->SetInstantRun(flag);
}
for (int i = 0; i < bworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)bworkers.GetItem(i);
worker->SetInstantRun(flag);
}
}
/* start the workers */
void XLeader::Start()
{
serverModel.CheckParam();
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers.GetItem(i);
worker->GetModel()->CheckParam();
worker->Start();
}
for (int i = 0; i < cworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)cworkers.GetItem(i);
worker->Start();
}
for (int i = 0; i < uworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)uworkers.GetItem(i);
worker->Start();
}
for (int i = 0; i < bworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)bworkers.GetItem(i);
worker->Start();
}
}
/*
add a number of job workers (given their device ids)
>> model - the neural network
>> n - number of the models
>> ids - the array of device ids
*/
void XLeader::AddJobWorker(XModel * model, int n, int * ids)
{
/* we keep the input model */
if (n >= 1) {
XWorkerJob * worker = new XWorkerJob();
worker->SetModel(model);
jworkers.Add(worker);
}
/* we clone the input model */
for (int i = 1; i < n; i++) {
XWorkerJob * worker = new XWorkerJob();
worker->SetModel(model->Clone(ids[i]));
jworkers.Add(worker);
}
}
/*
add a data-collecting worker
>> mode - the data-transfer mode of the worker
*/
void XLeader::AddJobCollectWorker(DATA_COLLECT_TYPE mode)
{
XWorkerCollect * worker = new XWorkerCollect();
worker->SetCollectMode(mode);
cworkers.Add(worker);
}
/*
add a model-update worker
>> model - the model
>> optimizer - the optimizer
*/
void XLeader::AddJobUpdateWorker(XModel * model, XOptimizer * optimizer)
{
XWorkerUpdate * worker = new XWorkerUpdate();
worker->SetOptimizer(optimizer);
uworkers.Add(worker);
}
/* add a data-broadcasting worker */
void XLeader::AddJobBroadcastWorker()
{
XWorkerBroadcast * worker = new XWorkerBroadcast();
bworkers.Add(worker);
}
/*
run the model (for one time). Basically this is a map-reduce process.
>> config - the configuration
>> dataDistributor - data distributor
>> model - the neural network that we want to run
>> optimizer - the optimization method
<< return - if we can fetch the new data
*/
bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer)
{
CheckNTErrors(jworkers.count > 0, "No jworkers!");
CheckNTErrors(cworkers.count > 0, "No cworkers!");
CheckNTErrors(uworkers.count > 0, "No uworkers!");
CheckNTErrors(bworkers.count > 0, "No bworkers!");
bool isDataOK = true;
bool isToUpdate = (optimizer != NULL);
int activeJobCount = 0;
int* active = new int[jworkers.count];
InitForRun();
for (int i = 0; i < jworkers.count; i++)
active[i] = 0;
/* Feed the input to each worker and generate the output.
For each worker, we define a job queue and enqueue jobs
into it.
*/
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[i];
XModel * jmodel = worker->GetModel();
/* get a batch of samples */
bool fetched = dataDistributor->GetBatchSimple(worker->GetInput(), worker->GetGold());
if (!fetched)
isDataOK = false;
else {
/* job in queue 1: refresh the model */
worker->AddJobRefresh(jmodel);
/* job in queue 1: run the model */
worker->AddJobNeuralNet(jmodel,
worker->GetInput(), worker->GetOutput(),
worker->GetGold(), worker->GetLoss());
/* job in queue 1: make a record of the run */
worker->AddJobRecord(&serverRecord);
/* job in queue 1: mark finished */
worker->AddJobEnqueueFinished();
active[i] = 1;
activeJobCount++;
}
}
if (activeJobCount > 0 && isToUpdate) {
/* workers */
XWorkerCollect * collecter = (XWorkerCollect*)cworkers.GetItem(0);
XWorkerUpdate * updater = (XWorkerUpdate*)uworkers.GetItem(0);
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)bworkers.GetItem(0);
/* member models that are active in this run */
XList members(jworkers.count);
/* all member models */
XList membersAll(jworkers.count);
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)jworkers[i];
membersAll.Add(worker->GetModel());
if (active[i] == 1)
members.Add(worker->GetModel());
}
/* jobs in queue 2: collect the (gradient) data and other stuff. This
is a reduce process. The collector will add a job in queue 3
to update the model. The updater will add a job in queue 4 to
broadcast the latest parameters to workers. NOTE that we would update
a worker to the latest model parameters, even if it is not involved
in this run. */
collecter->AddJobUpdateAll(&members, &membersAll, &serverModel,
optimizer, updater, broadcaster);
collecter->AddJobEnqueueFinished();
}
WaitForFinishing(active, isToUpdate);
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[i];
worker->Clear();
}
delete[] active;
return isDataOK;
}
/* wait until all workers finish their job */
void XLeader::WaitForFinishing(int sleepTime)
{
while (1) {
bool finished = true;
if (finished) {
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)jworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished) {
for (int i = 0; i < cworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)cworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished) {
for (int i = 0; i < uworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)uworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished) {
for (int i = 0; i < bworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)bworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished)
break;
XSleep(sleepTime);
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A "leader" manages a number of "workers". The leader recieves jobs from
* the central server (can be remote), or acts as an independent server itself.
* For workers, the leader is the one who issues orders and organizes them.
* Note that the leader and workers must be on the same machine. In case of
* multi-machine training, one can deploy different leaders on different
* machines. BUT, at this time, we need an additional way of distributing
* data across machines.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
* We will go on a business trip. The first trip after the Spring Festival.
*/
#ifndef __XLEADER_H__
#define __XLEADER_H__
#include "XModel.h"
#include "XOptimizer.h"
#include "XBaseTemplate.h"
#include "XWorkerJob.h"
#include "XWorkerCollect.h"
#include "XWorkerUpdate.h"
#include "XWorkerBroadcast.h"
#include "./optimizer/OHeader.h"
#include "../tensor/XConfig.h"
#include "../tensor/XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_NUM_OF_WORKERS 1024
#define SLEEP_TIME_IN_WAITING_FOR_JOBS 20
/*
communication mode of a leader. This offers a way of organizing a hierarchy of the work:
1) run as a standalone program
2) give orders to another leader (probably remote)
3) receive orders from another leader (probably remote)
4) give (and receive) orders to (and from) different leaders
*/
enum XLEADER_MODE { XLEADER_STANDALONE, XLEADER_SEND, XLEADER_RECIEVE, XLEADER_SEND_AND_RECIEVE };
/* a leader who manages workers */
class XLeader
{
protected:
/* id of the leader */
int id;
/* a model that keeps the parameters (as a server) */
XModel serverModel;
/* a record that keeps the information of the run */
XNNRecord serverRecord;
/* communication mode */
XLEADER_MODE mode;
/* job workers */
XList jworkers;
/* data-collecting workers */
XList cworkers;
/* model-update workers */
XList uworkers;
/* data-broadcasting workers */
XList bworkers;
public:
/* constructor */
XLeader();
/* de-constructor */
~XLeader();
/* initialize the leader */
void Init();
/* set id */
void SetID(int myID);
/* get id */
int GetID();
/* set the server model */
void SetServerModel(XConfig * config, XModel * model, XList * memberModels);
/* set the server model */
void SetServerModel(XConfig * config, XModel * model);
/* initialize the models for running them */
void InitForRun();
/* set grad = 0 */
void ResetParamGrad();
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* get loss */
float GetLoss();
/* get sample number */
int GetSampleNum();
/* get prediction number */
int GetPredictNum();
/* start the workers */
void Start();
/* set the communication mode */
void SetMode(XLEADER_MODE myMode);
/* set the flag of instant run */
void SetInstantRun(bool flag = true);
/* add a number of job workers (given their device ids) */
void AddJobWorker(XModel * model, int n, int * ids);
/* add a data-collecting worker */
void AddJobCollectWorker(DATA_COLLECT_TYPE mode = DATA_COLLECT_P2P);
/* add a model-update worker */
void AddJobUpdateWorker(XModel * model, XOptimizer * optimizer);
/* add a data-broadcasting worker */
void AddJobBroadcastWorker();
/* run the model (for one time) */
bool Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer);
/* wait until all workers finish their job */
void WaitForFinishing(int sleepTime = SLEEP_TIME_IN_WAITING_FOR_JOBS);
};
}
#endif // __XLEADER_H__
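A hedged sketch of how a trainer might drive a leader, using only the methods declared above (the exact wiring lives in XTrainer, which is not part of this section, so the ordering here is an assumption):

XLeader leader;
leader.Init();
leader.AddJobWorker(model, n, devIDs);        /* n replicas of the model */
leader.AddJobCollectWorker();                 /* gradient reduce */
leader.AddJobUpdateWorker(model, optimizer);  /* parameter update */
leader.AddJobBroadcastWorker();               /* send new parameters back */
leader.SetServerModel(config, model);         /* after the job workers exist */
leader.Start();                               /* check params and start the workers */
while (leader.Run(config, &loader, model, optimizer)) {
    /* one mini-batch per call; returns false when the data runs out */
}
leader.WaitForFinishing();                    /* drain the remaining jobs */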
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-16
* I wore my coat again after the rain yesterday.
*/
#include "XLearningRate.h"
#include <math.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XLearningRate::XLearningRate()
{
}
/* de-constructor */
XLearningRate::~XLearningRate()
{
}
/* a Transformer-style scheduler. For more details, see
"Attention is all need" by Vaswani at al.
>> lrate - the learning rate
>> nstep - the update step number
>> nwarmup - the warmup step number
*/
float XLearningRate::MakeLRTransformer(const float lrate, const int nstep, const int nwarmup)
{
float lr = 0;
float warmupEndLR = lrate;
float warmupInitLR = 1e-7F;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * (float)pow(float(nwarmup), 0.5F);
/* learning rate, scheduled by inverse square root */
if (nstep < nwarmup)
lr = warmupInitLR + nstep * lrStep;
else
lr = decayFactor * (float)pow((float)nstep, -0.5F);
return lr;
}
}
\ No newline at end of file
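Written out, MakeLRTransformer implements the inverse-square-root schedule with linear warmup (here $s$ = nstep, $n$ = nwarmup, $\eta$ = lrate, and $\eta_0$ = warmupInitLR = $10^{-7}$):

$$\mathrm{lr}(s) = \begin{cases} \eta_0 + s \cdot \dfrac{\eta - \eta_0}{n} & s < n \\[4pt] \eta \cdot \sqrt{n/s} & s \ge n \end{cases}$$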
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is a learning rate generator. E.g., one can adjust the learning rate as
* the training process proceeds.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-16
* I wore my coat again after the rain yesterday.
*/
#ifndef __XLEARNINGRATE_H__
#define __XLEARNINGRATE_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* Learning rate scheduler */
class XLearningRate
{
public:
/* constructor */
XLearningRate();
/* de-constructor */
~XLearningRate();
/* a Transformer-style scheduler */
float MakeLRTransformer(const float lrate, const int nstep, const int nwarmup);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class maintains the parameters (and other stuff) for training. It
* could be used to manage the parameter copy and update in training. E.g.,
* one can use this class to keep the parameters on the server side, or
* treat it as an individual model on the worker side.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
*/
#include "XModel.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XParamKeeper::XParamKeeper()
{
param = NULL;
flag = PARAM_STATE_NOT_READY;
trainFlag = PARAM_STATE_NOT_READY;
MUTEX_INIT(accessLock);
MUTEX_INIT(trainLock);
}
/* de-constructor */
XParamKeeper::~XParamKeeper()
{
MUTEX_DELE(accessLock);
MUTEX_DELE(trainLock);
}
/* constructor */
XModel::XModel()
{
params = NULL;
paramNum = 0;
MUTEX_INIT(modelMutex);
}
/* de-constructor */
XModel::~XModel()
{
Clear();
MUTEX_DELE(modelMutex);
}
/* clear the model */
void XModel::Clear()
{
delete[] params;
params = NULL;
paramNum = 0;
}
/*
clone the model (would be overloaded)
>> devID - the device on that we keep the model
<< return - a cloned model
*/
XModel * XModel::Clone(int devID)
{
ShowNTErrors("XModel::Clone() should be overloaded!");
return NULL;
}
/*
run the neural network
>> inputs - inputs of the model
>> outputs - outputs of the model
>> golds - gold standards
>> losses - losses of the input with respect to the gold standards
*/
bool XModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses)
{
return false;
}
/*
run the neural network
>> args - the arguments
*/
bool XModel::RunMe(XList * args)
{
CheckNTErrors(args->count >= 4, "More arguments are required!");
XList * inputs = (XList*)args->GetItem(0);
XList * outputs = (XList*)args->GetItem(1);
XList * golds = (XList*)args->GetItem(2);
XList * losses = (XList*)args->GetItem(3);
if (RunSimple(inputs, outputs, golds, losses))
return true;
ShowNTErrors("You must be overload one of these: XModel::RunSimple ... !");
return false;
}
/*
add a parameter tensor
>> param - the parameter tensor to add
*/
void XModel::AddParam(XTensor* param)
{
param->SetVarFlag();
XParamKeeper * newParams = new XParamKeeper[paramNum + 1];
for (int i = 0; i < paramNum; i++) {
newParams[i].param = params[i].param;
newParams[i].flag = params[i].flag;
}
newParams[paramNum].param = param;
newParams[paramNum].flag = PARAM_STATE_NOT_READY;
delete[] params;
params = newParams;
paramNum++;
}
/* check if the parameters are well-defined for training */
bool XModel::CheckParam()
{
for (int i = 0; i < paramNum; i++) {
XTensor * param = params[i].param;
if (!param->isGrad)
return false;
}
return true;
}
/* initialize the model for running it */
void XModel::InitForRun()
{
RefreshMe();
}
/* lock the parameter states (wait for unlocking them when
a run of training is finished) */
void XModel::LockParamsForTraining()
{
for (int i = 0; i < paramNum; i++) {
params[i].trainFlag = PARAM_STATE_NOT_READY;
MUTEX_LOCK(params[i].trainLock);
/* where is UNLOCK? We will do this when the training (a step)
is finished. Then, WaitForUnlockedParams() can continue. In
such a way, we implement a START-WAIT process in each run
of training (a step) */
}
}
/* unlock the parameter states */
void XModel::WaitForUnlockedParams()
{
for (int i = 0; i < paramNum; i++) {
/* the lock proceeds only when the trainLock is unlocked
in training. In this way, we are actually waiting for
the FINISHED signal from other workers/threads. */
MUTEX_LOCK(params[i].trainLock);
CheckNTErrors(params[i].trainFlag == PARAM_STATE_UPDATED,
"the state of the parameter is wrong!");
MUTEX_UNLOCK(params[i].trainLock);
}
}
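/* A sketch of the START-WAIT protocol built from the two methods above.
The unlocking side lives elsewhere (in the training code), which is assumed
to set trainFlag and release trainLock once a step is done:

server thread                          training worker
-------------                          ---------------
LockParamsForTraining();               ... compute and update ...
WaitForUnlockedParams();  <- blocks    params[i].trainFlag = PARAM_STATE_UPDATED;
                                       MUTEX_UNLOCK(params[i].trainLock);
(resumes once every parameter's trainLock has been released)
*/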
/* refresh the model */
void XModel::RefreshMe()
{
for (int i = 0; i < paramNum; i++) {
params[i].param->isGradFinished = false;
params[i].flag = PARAM_STATE_NOT_READY;
params[i].trainFlag = PARAM_STATE_NOT_READY;
}
}
/* wrapper of RefreshMe */
void XModel::Refresh(XList * args)
{
CheckNTErrors(args != NULL && args->count > 0, "no arguments for XModel::Refresh!");
XModel * model = (XModel*)args->GetItem(0);
model->RefreshMe();
}
/* wrapper of Run() */
bool XModel::Run(XList * args)
{
CheckNTErrors(args != NULL && args->count > 0, "no arguments for XModel::Run!");
XModel * model = (XModel*)args->GetItem(0);
XList newArgs;
for (int i = 1; i < args->count; i++) {
void * arg = args->GetItem(i);
newArgs.Add(arg);
}
return model->RunMe(&newArgs);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class maintains the parameters (and other stuff) for training. It
* could be used to manage the parameter copy and update in training. E.g.,
* one can use this class to keep the parameters on the server side, or
* treat it as an individual model on the worker side.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
* I created more than one file today, hahaha
*/
#ifndef __XMODEL_H__
#define __XMODEL_H__
#include "../network/XNet.h"
#include "../tensor/XQueue.h"
#include "../tensor/XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
parameter state
1) not ready
2) ready
3) the parameter has been collected from other models
4) the updated parameter
*/
enum PARAM_STATE { PARAM_STATE_NOT_READY,
PARAM_STATE_READY,
PARAM_STATE_COLLECTED,
PARAM_STATE_UPDATED };
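/* A typical life cycle of a parameter within one training step (as driven
by XWorkerCollect and XWorkerUpdate): PARAM_STATE_NOT_READY while gradients
are still being computed, PARAM_STATE_COLLECTED once the gradients of all
active members have been summed on the server, and PARAM_STATE_UPDATED
after the optimizer has applied the update. XModel::RefreshMe() resets the
flag to PARAM_STATE_NOT_READY for the next step. */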
/* parameter keeper */
class XParamKeeper
{
public:
/* the parameter */
XTensor * param;
/* the parameter state */
PARAM_STATE flag;
/* the state of the entire training process
(choosing from PARAM_STATE_NOT_READY and
PARAM_STATE_UPDATED) */
PARAM_STATE trainFlag;
/* a mutex for locking and unlocking the parameter */
MUTEX_HANDLE accessLock;
/* a mutex of the overall training */
MUTEX_HANDLE trainLock;
public:
/* constructor */
XParamKeeper();
/* de-constructor */
~XParamKeeper();
};
/* a model template for training */
class XModel
{
protected:
/* mutex of the model */
MUTEX_HANDLE modelMutex;
public:
/* the list of model parameters */
XParamKeeper * params;
/* parameter number */
int paramNum;
public:
/* constructor */
XModel();
/* de-constructor */
~XModel();
/* clear the model (would be overloaded) */
virtual
void Clear();
/* clone the model (would be overloaded) */
virtual
XModel * Clone(int devID);
/* run the neural network */
virtual
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
protected:
/* run the neural network */
bool RunMe(XList * args);
public:
/* add a parameter tensor */
void AddParam(XTensor * param);
/* check if the parameters are well-defined for training */
bool CheckParam();
/* lock the parameter states (wait for unlocking them when
a run of training is finished) */
void LockParamsForTraining();
/* wait for the parameter states to be unlocked */
void WaitForUnlockedParams();
/* initialize the model for running it */
void InitForRun();
/* refresh the model */
void RefreshMe();
/* wrapper of RefreshMe() */
static
void Refresh(XList * args);
/* wrapper of Run() */
static
bool Run(XList * args);
};
}
#endif // __XMODEL_H__
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A record that keeps some information in running and training neural networks
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-06
* I will climb mountains with my wife and son this afternoon, hahaha :)
*/
#include "XNNRecord.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XNNRecord::XNNRecord()
{
Clear();
MUTEX_INIT(mutex);
}
/* de-constructor */
XNNRecord::~XNNRecord()
{
MUTEX_DELE(mutex);
}
/* clear it */
void XNNRecord::Clear()
{
lossAll = 0;
sampleNum = 0;
predictNum = 0;
state = XWORKER_UNSTARTED;
}
/* update me with another record */
void XNNRecord::Update(XNNRecord & record)
{
lossAll += record.lossAll;
sampleNum += record.sampleNum;
predictNum += record.predictNum;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A record that keeps some information in running and training neural networks
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-06
* I will climb mountains with my wife and son this afternoon, hahaha :)
*/
#ifndef __XNNRECORD_H__
#define __XNNRECORD_H__
#include "XWorker.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* a record that keeps some information during training */
class XNNRecord
{
public:
/* loss over all samples */
float lossAll;
/* sample number */
int sampleNum;
/* prediction number */
int predictNum;
/* state */
XWORKER_STATE state;
/* mutex */
MUTEX_HANDLE mutex;
public:
/* constructor */
XNNRecord();
/* de-constructor */
~XNNRecord();
/* clear it */
void Clear();
/* update me with another record */
void Update(XNNRecord & record);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class defines the template of the update rule in gradient-based methods
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#include "XOptimizer.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XOptimizer::XOptimizer()
{
Clear();
}
/* de-constructor */
XOptimizer::~XOptimizer()
{
Clear();
}
/*
initialize the optimizer
>> config - the configuration
*/
void XOptimizer::Init(XConfig &config)
{
nstep = config.GetInt("nstep", 100000);
nepoch = config.GetInt("nepoch", 50);
lrate = config.GetFloat("lrate", 0.1F);
}
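/* Example (a sketch): if the configuration carries "nstep 50000",
"nepoch 20" and "lrate 0.0005", Init() sets nstep = 50000, nepoch = 20 and
lrate = 0.0005F; keys that are absent fall back to the defaults above
(100000, 50 and 0.1F). The exact key syntax depends on how the XConfig
instance was populated. */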
/* clear the optimizer */
void XOptimizer::Clear()
{
nstep = 0;
nepoch = 0;
lrate = 0;
}
/* reset the optimizer (re-start) */
void XOptimizer::Reset()
{
}
void XOptimizer::ShowSettings()
{
XPRINT(1, stderr, "[INFO] Optimizer Setup:\n");
XPRINT2(1, stderr, "%25s = %d\n", "nstep", nstep);
XPRINT2(1, stderr, "%25s = %d\n", "nepoch", nepoch);
XPRINT2(1, stderr, "%25s = %.3f\n", "lrate", lrate);
}
/*
record the update
>> model - the model that we want to update
*/
void XOptimizer::Note(XModel * model)
{
nstep++;
}
/*
update a parameter matrix
>> param - the parameter matrix
>> grad - the gradient
>> pid - the id of the parameter matrix
*/
void XOptimizer::UpdateParam(XTensor * param, XTensor * grad, int pid)
{
/* the delta rule
\theta_new = \theta_old - \grad * \lrate */
_Sum(param, grad, param, -lrate);
}
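/* A worked instance of the delta rule above: with lrate = 0.1, a parameter
entry of 0.5 and a gradient entry of 0.2, the call
_Sum(param, grad, param, -0.1F) computes 0.5 + 0.2 * (-0.1) = 0.48, i.e.,
param = param + grad * beta with beta = -lrate, which is exactly
\theta_new = \theta_old - lrate * grad. */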
/* get learning rate */
float XOptimizer::GetLearningRate()
{
return lrate;
}
/*
set learning rate
>> myLRate - the learning rate that we want to use
*/
void XOptimizer::SetLearningRate(float myLRate)
{
lrate = myLRate;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class defines the template of the update rule in gradient-based methods
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
* March came finally but there was a snow last night.
*/
#ifndef __XOPTIMIZER_H__
#define __XOPTIMIZER_H__
#include "XModel.h"
#include "../tensor/XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* this class defines a template of the optimizer and
implement the simple delta-rule in SGD. */
class XOptimizer
{
public:
/* update step number */
int nstep;
/* training epoch number */
int nepoch;
/* learning rate */
float lrate;
public:
/* constructor */
XOptimizer();
/* de-constructor */
~XOptimizer();
/* initialize the optimizer */
virtual
void Init(XConfig &config);
/* clear the optimizer */
virtual
void Clear();
/* reset the optimizer (re-start) */
virtual
void Reset();
/* show settings */
virtual
void ShowSettings();
/* record the update */
virtual
void Note(XModel * model);
/* update a parameter matrix */
virtual
void UpdateParam(XTensor * param, XTensor * grad, int pid);
/* get learning rate */
float GetLearningRate();
/* set learning rate */
void SetLearningRate(float myLRate);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-23
*
*/
#include "XTrainer.h"
#include "XLearningRate.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XTrainer::XTrainer()
{
}
/* de-constructor */
XTrainer::~XTrainer()
{
}
/*
get the device ids of the jobs
>> config - configuration
>> ids - the array of device ids
>> num - number of the jobs
>> maxDevNum - the maximum number of devices
*/
void XTrainer::GetDevIDs(XConfig * config, int * ids, int & num, int maxDevNum)
{
CheckNTErrors(maxDevNum > 0, "No data array for input!");
num = 0;
for (int i = 0; i < maxDevNum; i++) {
char dev[16];
sprintf(dev, "jobdev%d", i);
int id = config->GetInt(dev, -128);
if (id != -128) {
ids[num++] = id;
}
else
break;
}
if (num == 0) {
char dev[16];
sprintf(dev, "jobdev");
int id = config->GetInt(dev, -128);
if (id != -128)
ids[num++] = id;
}
if (num == 0) {
char dev[16];
sprintf(dev, "dev");
int id = config->GetInt(dev, -128);
if (id != -128)
ids[num++] = id;
}
}
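/* Example (a sketch): with config entries jobdev0=0, jobdev1=1 and
jobdev2=-1, this fills ids = {0, 1, -1} and num = 3 (two GPUs plus a CPU,
since negative ids denote CPU devices in ShowSettings below). If no jobdevX
key is found, a single "jobdev" entry is tried, and then "dev". */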
/*
run the trainer (this is the core process)
>> config - configuration
>> dataDistributor - the data distributor that generates an input for the net each time
>> model - the neural network
>> optimizer - the optimizer
*/
void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer)
{
CheckNTErrors(config != NULL, "No input config!");
CheckNTErrors(dataDistributor != NULL, "No input data distributor!");
CheckNTErrors(model != NULL, "No input neural network!");
int epoch = 0;
int step = 0;
int stepAll = 0;
int jobNum = 0;
int accumulation = config->GetInt("accumulation", 1);
int nwarmup = config->GetInt("nwarmup", 0);
float lrate = optimizer->GetLearningRate();
CheckNTErrors(accumulation >= 1, "accumulation must be larger than 0!");
int * ids = new int[MAX_DEVICE_NUM_TRAINING];
GetDevIDs(config, ids, jobNum, MAX_DEVICE_NUM_TRAINING);
optimizer->ShowSettings();
this->ShowSettings(config);
/* create the server and workers */
XLeader leader;
leader.Init();
leader.AddJobWorker(model, jobNum, ids);
leader.AddJobCollectWorker();
leader.AddJobUpdateWorker(model, optimizer);
leader.AddJobBroadcastWorker();
//leader.SetInstantRun();
leader.SetServerModel(config, model);
leader.Start();
/* learning rate scheduler */
XLearningRate LRScheduler;
double startT = GetClockSec();
XPRINT(1, stderr, "[INFO] Initializing the model ... [DONE]\n");
/* train the model */
for (epoch = 0; epoch < optimizer->nepoch; epoch++) {
bool ok = true;
dataDistributor->Start();
while (ok) {
if (++stepAll % accumulation == 0) {
/* learning rate scheduling */
if (nwarmup > 0)
optimizer->SetLearningRate(LRScheduler.MakeLRTransformer(lrate, step + 1, nwarmup));
/* one step of update */
ok = leader.Run(config, dataDistributor, model, optimizer);
float loss = leader.GetLoss() / leader.GetSampleNum();
if ((step + 1) % 100 == 0)
XPRINT5(1, stderr, "[INFO] elapsed=%.1fs epoch:%d step:%d sample:%d loss:%f\n",
GetClockSec() - startT, epoch + 1, step + 1, leader.GetSampleNum(), loss);
leader.ResetParamGrad();
if (++step >= optimizer->nstep)
break;
}
else {
/* one step with no update */
ok = leader.Run(config, dataDistributor, model, NULL);
}
}
dataDistributor->End();
if (step >= optimizer->nstep)
break;
}
delete[] ids;
}
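/* A minimal sketch of driving the trainer. MyModel and MyDataDistributor
stand for hypothetical user-defined subclasses of XModel and
DataDistributeBase (not part of the library), and config is assumed to be
populated already:

MyModel model;
MyDataDistributor data;
XOptimizer optimizer;
optimizer.Init(config);
XTrainer trainer;
trainer.Run(&config, &data, &model, &optimizer);
*/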
/* show settings of training */
void XTrainer::ShowSettings(XConfig* config)
{
int workerNum = 0;
int* ids = new int[MAX_DEVICE_NUM_TRAINING];
GetDevIDs(config, ids, workerNum, MAX_DEVICE_NUM_TRAINING);
XPRINT(1, stderr, "[INFO] Training Setup:\n");
XPRINT2(1, stderr, "%25s = %d\n", "nworker", workerNum);
if (workerNum > 0) {
if (ids[0] < 0) {
XPRINT2(1, stderr, "%25s = CPU[%d]\n", "worker0(server)", ids[0]);
}
else{
XPRINT2(1, stderr, "%25s = GPU[%d]\n", "worker0(server)", ids[0]);
}
for (int i = 1; i < workerNum; i++) {
char name[32];
sprintf(name, "worker%d", i);
if (ids[i] < 0) {
XPRINT2(1, stderr, "%25s = CPU[%d]\n", name, ids[i]);
}
else {
XPRINT2(1, stderr, "%25s = GPU[%d]\n", name, ids[i]);
}
}
}
XPRINT2(1, stderr, "%25s = %d\n", "accumulation", config->GetInt("accumulation", 1));
delete[] ids;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
......@@ -23,18 +24,22 @@
* Distributed training is supported.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-23
* I start coding in 2021 after one year since I typed last C code.
* I started coding in 2021, one year after I typed my last line of C code.
* BUT I was a GOOD TeX writer in 2020 :)
*/
#ifndef __XTRAINER_H__
#define __XTRAINER_H__
#include "XLeader.h"
#include "../network/XNet.h"
#include "../tensor/XQueue.h"
#include "../tensor/XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_DEVICE_NUM_TRAINING 128
/*
Training of neural networks with gradient methods. Here we suppose that we
are training NLP models. The routine could be:
......@@ -56,14 +61,25 @@ the job to the workers and maintain the model.
*/
class XTrainer
{
private:
public:
/* constructor */
XTrainer();
/* de-constructor */
~XTrainer();
protected:
/* get the device ids of the jobs */
void GetDevIDs(XConfig * config, int * ids, int & num, int maxDevNum);
public:
/* run the leader (this is the core process) */
virtual
void Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer);
/* show settings of training */
void ShowSettings(XConfig * config);
};
}
#endif // __XTRAINER_H__
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The base class of worker. It maintains a job queue and offers utilities
* for controlling the working pipeline.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
*/
#include "XWorker.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XWorker::XWorker()
{
devID = -1;
id = -1;
state = XWORKER_UNSTARTED;
isInstantRun = false;
}
/* de-constructor */
XWorker::~XWorker()
{
Stop();
}
/* set device id */
void XWorker::SetDeviceID(int myDevID)
{
devID = myDevID;
}
/* get device id */
int XWorker::GetDeviceID()
{
return devID;
}
/* set worker id */
void XWorker::SetID(int myID)
{
id = myID;
}
/* get worker id */
int XWorker::GetID()
{
return id;
}
/* set the flag of instant run */
void XWorker::SetInstantRun(bool flag)
{
isInstantRun = flag;
}
/*
enqueue a new job
>> job - the job function
>> jobArgs - the arguments of the function
*/
void XWorker::AddJob(void * job, XList * jobArgs)
{
queue.EnqueueJob(job, jobArgs);
}
/* start the work */
void XWorker::Start()
{
queue.RunJobConsumer();
}
/* stop the work */
void XWorker::Stop()
{
queue.StopJobConsumer();
}
/* get the number of remaining jobs */
int XWorker::GetJobNum()
{
return queue.GetJobNum();
}
/* check whether the job queue is empty */
bool XWorker::IsEmpty()
{
return queue.IsEmpty();
}
/* enqueue a counting job of a finished job */
void XWorker::EnqueueFinishedJob()
{
finishedQueue.Enqueue(NULL);
}
/* dequeue a counting job of a finished job */
void XWorker::DequeueFinishedJob()
{
finishedQueue.Dequeue();
}
/* wrapper of EnqueueFinished() */
void XWorker::EnqueueFinished(XList* args)
{
XWorker* worker = (XWorker*)args->GetItem(0);
worker->EnqueueFinishedJob();
}
/* wrapper of DequeueFinished() */
void XWorker::DequeueFinished(XList* args)
{
XWorker* worker = (XWorker*)args->GetItem(0);
worker->DequeueFinishedJob();
}
/* add a job that enqueues the count of a finished job */
void XWorker::AddJobEnqueueFinished()
{
XList args;
args.Add(this);
if (isInstantRun)
XWorker::EnqueueFinished(&args);
else
queue.EnqueueJob((void*)(char*)XWorker::EnqueueFinished, &args);
}
/* add a job that dequeues the count of a finished job */
void XWorker::AddJobDequeueFinished()
{
XList args;
args.Add(this);
if (isInstantRun)
XWorker::DequeueFinished(&args);
else
queue.EnqueueJob((void*)(char*)XWorker::DequeueFinished, &args);
}
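/* A usage sketch of the job pipeline above. myJob stands for any function
of type void (*)(XList*) whose arguments are packed into an XList, in the
same style as the static wrappers in this file:

XWorker worker;
worker.Start();                      // spawn the job consumer
worker.AddJob((void*)myJob, &args);  // enqueue the actual work
worker.AddJobEnqueueFinished();      // then count it as finished
...
worker.Stop();
*/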
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The base class of worker. It maintains a job queue and offers utilities
* for controlling the working pipeline.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
* People started to go back to normal life after the Spring Festival.
* Traffic jams again.
*/
#ifndef __XWORKER_H__
#define __XWORKER_H__
#include "../tensor/XQueue.h"
#include "../tensor/XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
state of a worker
1) unstarted
2) started
3) finished
*/
enum XWORKER_STATE { XWORKER_UNSTARTED, XWORKER_STARTED, XWORKER_FINISHED };
/* the worker class */
class XWorker
{
protected:
/* id of the device where we run the worker (we suppose that
the worker is on-site). */
int devID;
/* id of the worker */
int id;
/* the queue of jobs */
XQueue queue;
/* state of the worker */
XWORKER_STATE state;
/* fire the flag of instant run */
bool isInstantRun;
/* the queue of counting finished jobs */
XQueue finishedQueue;
public:
/* constructor */
XWorker();
/* de-constructor */
~XWorker();
/* set device id */
void SetDeviceID(int myDevID);
/* get device id */
int GetDeviceID();
/* set worker id */
void SetID(int myID);
/* get worker id */
int GetID();
/* set the flag of instant run */
void SetInstantRun(bool flag = true);
/* enqueue a new job */
void AddJob(void * job, XList * jobArgs);
/* start the work */
void Start();
/* stop the work */
void Stop();
/* get the number of remaining jobs */
int GetJobNum();
/* check whether the job queue is empty */
bool IsEmpty();
/* enqueue a counting job of a finished job */
void EnqueueFinishedJob();
/* dequeue a counting job of a finished job */
void DequeueFinishedJob();
/* wrapper of EnqueueFinished() */
static
void EnqueueFinished(XList* args);
/* wrapper of DequeueFinished() */
static
void DequeueFinished(XList* args);
/* add a job that enqueues the count of a finished job */
void AddJobEnqueueFinished();
/* add a job that dequeues the count of a finished job */
void AddJobDequeueFinished();
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that broadcasts the latest parameters from the server to
* the workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
*/
#include "XWorkerBroadcast.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XWorkerBroadcast::XWorkerBroadcast()
{
}
/* de-constructor */
XWorkerBroadcast::~XWorkerBroadcast()
{
}
/* set the broadcasting type */
void XWorkerBroadcast::SetBroadcastMode(DATA_BROADCAST_TYPE myMode)
{
broadcastMode = myMode;
}
/*
broadcast data for a parameter
>> source - the data (as a model) that we want to broadcast
>> targetList - the target places that receive the data
>> pid - the parameter index
*/
void XWorkerBroadcast::BroadcastDataSingle(XModel * source, XList * targetList, int pid)
{
CheckNTErrors(source->params[pid].flag == PARAM_STATE_UPDATED,
"The parameter is not ready for broadcasting");
for (int i = 0; i < targetList->count; i++) {
XModel * target = (XModel*)targetList->GetItem(i);
/* data transmit */
BroadcastP2P(source->params[pid].param, target->params[pid].param);
/* update the flag */
target->params[pid].flag = PARAM_STATE_UPDATED;
}
}
/*
broadcast data for a model
>> source - the data that we want to broadcast
>> targetList - the target places that receive the data
>> sleepTime - the waiting time in broadcasting
*/
void XWorkerBroadcast::BroadcastData(XModel * source, XList * targetList, int sleepTime)
{
int finished = 0;
int * finishedFlag = new int[source->paramNum];
memset(finishedFlag, 0, sizeof(int) * source->paramNum);
/* check */
for (int i = 0; i < targetList->count; i++) {
XModel * target = (XModel*)targetList->GetItem(i);
CheckNTErrors(source->paramNum == target->paramNum, "Incompatible models!");
}
/* the major body of broadcasting */
while (1) {
for (int i = 0; i < source->paramNum; i++) {
if (source->params[i].flag == PARAM_STATE_UPDATED && finishedFlag[i] == 0) {
/* broadcasting */
BroadcastDataSingle(source, targetList, i);
/* counting */
finished += targetList->count;
finishedFlag[i] = 1;
}
}
if (finished == source->paramNum * targetList->count)
break;
XSleep(sleepTime);
}
delete[] finishedFlag;
}
/*
wrapper of BroadcastDataSingle
>> args - the list of arguments
*/
void XWorkerBroadcast::BroadcastSingle(XList * args)
{
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(0);
XModel * source = (XModel*)args->GetItem(1);
/* target models */
int targetNum = args->GetItemInt(2);
XList target;
for (int i = 0; i < targetNum; i++) {
XModel * model = (XModel*)args->GetItem(3 + i);
target.Add(model);
}
/* parameter index */
int p = args->GetInt(3 + targetNum);
broadcaster->BroadcastDataSingle(source, &target, p);
}
/*
wrapper of BroadcastData
>> args - the list of arguments
*/
void XWorkerBroadcast::Broadcast(XList * args)
{
//fprintf(stderr, "broadcast 0\n");
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(0);
XModel * source = (XModel*)args->GetItem(1);
/* target models */
int targetNum = args->GetItemInt(2);
XList target;
for (int i = 0; i < targetNum; i++) {
XModel * model = (XModel*)args->GetItem(3 + i);
target.Add(model);
}
broadcaster->BroadcastData(source, &target, SLEEP_TIME_IN_BROADCASTING);
//fprintf(stderr, "broadcast 1\n");
}
/*
P2P data broadcasting
>> source - the source data
>> target - the target data
*/
void XWorkerBroadcast::BroadcastP2P(XTensor * source, XTensor * target)
{
CheckNTErrors(source != NULL, "The source tensor should not be NULL!");
CheckNTErrors(target != NULL, "The target tensor should not be NULL!");
CheckNTErrors(IsSameShaped(*source, *target), "The two tensors should be of the same shape!");
if(source != target)
CopyValues(*source, *target);
}
/*
add a new job of broadcasting data (for a parameter)
>> source - the data that we want to broadcast
>> targetList - the target places that receive the data
>> pid - the parameter index
*/
bool XWorkerBroadcast::AddJobBroadcastSingle(XModel * source, XList * targetList, int pid)
{
CheckNTErrors(source != NULL, "no input source tensor!");
CheckNTErrors(targetList != NULL, "no input target tensor list!");
CheckNTErrors(pid >= 0 && pid < source->paramNum, "illegal parameter index!");
XList args;
args.Add(this);
args.Add(source);
args.AddInt(targetList->count);
args.AddList(targetList);
args.AddInt(pid);
if (isInstantRun)
XWorkerBroadcast::BroadcastSingle(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerBroadcast::BroadcastSingle, &args);
return true;
}
/*
add a new job of broadcasting data (for a model)
>> source - the data that we want to broadcast
>> targetList - the target places that receive the data
*/
bool XWorkerBroadcast::AddJobBroadcast(XModel * source, XList * targetList)
{
CheckNTErrors(source != NULL, "no input source tensor!");
CheckNTErrors(targetList != NULL, "no input target tensor list!");
XList args;
args.Add(this);
args.Add(source);
args.AddInt(targetList->count);
args.AddList(targetList);
if (isInstantRun)
XWorkerBroadcast::Broadcast(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerBroadcast::Broadcast, &args);
return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that broadcasts the latest parameters from the server to
* the workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
* Several visitors will come today, so I have less time for coding.
*/
#ifndef __XWORKERBROADCAST_H__
#define __XWORKERBROADCAST_H__
#include "XWorker.h"
#include "XModel.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define SLEEP_TIME_IN_BROADCASTING 5
/*
data broadcasting method
1) point-to-point
*/
enum DATA_BROADCAST_TYPE { DATA_BROADCAST_P2P };
/* This class defines a broadcaster that transmits parameters from
a server to workers. */
class XWorkerBroadcast : public XWorker
{
protected:
DATA_BROADCAST_TYPE broadcastMode;
public:
/* constructor */
XWorkerBroadcast();
/* de-constructor */
~XWorkerBroadcast();
/* set the broadcasting type */
void SetBroadcastMode(DATA_BROADCAST_TYPE myMode);
/* broadcast data for a parameter */
void BroadcastDataSingle(XModel * source, XList * targetList, int pid);
/* broadcast data for a model */
void BroadcastData(XModel * source, XList * targetList, int sleepTime);
/* wrapper of BroadcastDataSingle */
static
void BroadcastSingle(XList * args);
/* wrapper of BroadcastData */
static
void Broadcast(XList * args);
/* P2P data broadcasting */
void BroadcastP2P(XTensor * source, XTensor * target);
/* add a new job of broadcasting data (for a parameter) */
bool AddJobBroadcastSingle(XModel * source, XList * targetList, int pid);
/* add a new job of broadcasting data (for a model) */
bool AddJobBroadcast(XModel * source, XList * targetList);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that collects data from workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#include "XWorkerCollect.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XWorkerCollect::XWorkerCollect()
{
collectMode = DATA_COLLECT_P2P;
}
/* de-constructor */
XWorkerCollect::~XWorkerCollect()
{
}
/* set the collection type */
void XWorkerCollect::SetCollectMode(DATA_COLLECT_TYPE myMode)
{
collectMode = myMode;
}
/*
collect the gradient data, update the parameters, and broadcast the
new parameters to all models. NOTE that this method just collects gradients
from member models. It then calls an XWorkerUpdate to update the parameters.
The XWorkerUpdate in turn calls an XWorkerBroadcast to broadcast the new
parameters back to the member models.
>> memberActive - member models that are active, i.e., have generated gradients
>> memberAll - all member models
>> server - the server model
>> optimizer - the optimizer
>> updater - the worker that updates the parameters
>> broadcaster - the worker that broadcasts the new parameters to all member
models
>> sleepTime - waiting time in collecting
*/
void XWorkerCollect::UpdateDataAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater,
XWorkerBroadcast * broadcaster, int sleepTime)
{
int finished = 0;
for (int j = 0; j < server->paramNum; j++)
server->params[j].flag = PARAM_STATE_NOT_READY;
/* check */
for (int i = 0; i < memberAll->count; i++) {
XModel * source = (XModel*)memberAll->GetItem(i);
CheckNTErrors(source->paramNum == server->paramNum, "Incompatible models!");
}
for (int i = 0; i < memberActive->count; i++) {
XModel * source = (XModel*)memberActive->GetItem(i);
CheckNTErrors(source->paramNum == server->paramNum, "Incompatible models!");
}
/* count how many member models have been collected for each parameter */
int * finishedCount = new int[server->paramNum];
memset(finishedCount, 0, sizeof(int) * server->paramNum);
/* This is a simple implementation of the wait-and-collect process. But
there is a risk that some models are not available, that is, the
loop would never stop. A solution might be that we force the loop
to break after waiting for a short time. */
while (1) {
if (collectMode == DATA_COLLECT_P2P) {
for (int j = 0; j < server->paramNum; j++) {
XParamKeeper &paramServer = server->params[j];
/* paramServer.param->isGradFinished is true only if the model finishes the computation
(in another process) */
if (paramServer.flag != PARAM_STATE_NOT_READY || !paramServer.param->isGradFinished)
continue;
/* check if all the models (or part of them) are ready */
for (int i = 0; i < memberActive->count; i++) {
XModel * source = (XModel*)memberActive->GetItem(i);
XParamKeeper &paramSource = source->params[j];
/* paramSource.param->isGradFinished is true only if the model finishes the computation
(in another process) */
if (paramSource.flag == PARAM_STATE_NOT_READY && paramSource.param->isGradFinished) {
/* data transmit */
CollectP2P(paramSource.param->grad, paramServer.param->grad);
/* reset the flag */
paramSource.flag = PARAM_STATE_COLLECTED;
finished++;
finishedCount[j]++;
/* we call model update (in another thread) and then
broadcast the new parameters to member models
(in another thread) */
if (finishedCount[j] == memberActive->count) {
paramServer.flag = PARAM_STATE_COLLECTED;
if (updater != NULL) {
updater->AddJobUpdateSingle(server, memberAll, j, optimizer, broadcaster);
updater->AddJobEnqueueFinished();
}
}
else if (finishedCount[j] > memberActive->count) {
ShowNTErrors("Something is wrong with finishedCount!");
}
}
}
}
}
else {
ShowNTErrors("Unsupported data collection mode!");
}
/* the collection finishes if all data tensors are processed */
if (finished == server->paramNum * memberActive->count)
break;
XSleep(sleepTime);
}
delete[] finishedCount;
}
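/* In short, the pipeline above does the following for each parameter j:
1) wait until every active member has finished computing its gradient;
2) CollectP2P: add each member's gradient into the server's gradient;
3) once all members are collected, mark the server copy as
PARAM_STATE_COLLECTED and hand it to the updater, which updates it and
triggers the broadcaster. Different parameters overlap in time, so
early-finishing parameters are transmitted and updated while later
gradients are still being computed. */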
/* wrapper of UpdateDataAll */
void XWorkerCollect::UpdateAll(XList * args)
{
XWorkerCollect * collecter = (XWorkerCollect*)args->GetItem(0);
int activeNum = args->GetInt(1);
XList memberActive;
for (int i = 0; i < activeNum; i++) {
XModel * member = (XModel*)args->GetItem(2 + i);
memberActive.Add(member);
}
int allNum = args->GetInt(2 + activeNum);
XList memberAll;
for (int i = 0; i < allNum; i++) {
XModel * member = (XModel*)args->GetItem(2 + activeNum + 1 + i);
memberAll.Add(member);
}
XModel * server = (XModel*)args->GetItem(2 + activeNum + 1 + allNum);
XOptimizer * optimizer = (XOptimizer*)args->GetItem(2 + activeNum + 1 + allNum + 1);
XWorkerUpdate * updater = (XWorkerUpdate*)args->GetItem(2 + activeNum + 1 + allNum + 2);
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(2 + activeNum + 1 + allNum + 3);
collecter->UpdateDataAll(&memberActive, &memberAll, server,
optimizer, updater, broadcaster,
SLEEP_TIME_IN_COLLECTING);
}
/*
P2P data collection
target += source
>> source - the source tensor
>> target - the target tensor
*/
void XWorkerCollect::CollectP2P(XTensor * source, XTensor * target)
{
CheckNTErrors(source != NULL, "The source tensor should not be NULL!");
CheckNTErrors(target != NULL, "The target tensor should not be NULL!");
CheckNTErrors(IsSameShaped(*source, *target), "The two tensors should be of the same shape!");
/* target += source */
if (source != target) {
XTensor * sourceOnSite = source;
if (source->devID != target->devID) {
sourceOnSite = new XTensor(target);
_CopyValues(source, sourceOnSite);
}
_Sum(target, sourceOnSite, target);
if (sourceOnSite != source)
delete sourceOnSite;
}
}
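/* Example: if source lives on GPU 0 and target on GPU 1, the code above
first copies source into a temporary tensor on the target device
(_CopyValues) and then runs _Sum(target, copy, target) there, so the
accumulation always happens on the target device; if both tensors sit on
the same device, no copy is made. */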
/*
sum-reduce for given tensors
target += source_0
target += source_1
...
target += source_n
>> source - the source tensor
>> target - the target tensor
*/
void XWorkerCollect::CollectReduceSum(XList * source, XTensor * target)
{
for (int i = 0; i < source->count; i++) {
XTensor * s = (XTensor*)source->GetItem(i);
CollectP2P(s, target);
}
}
/*
all-reduce: the well-known all-reduce method
every tensor is involved in every data transmission. The final outcome
is that all input tensors share the same value (i.e., the sum of them).
>> all - the tensors for sum
*/
void XWorkerCollect::CollectAllReduce(XList * all)
{
ShowNTErrors("TODO!");
}
/*
add a new job of collecting data, update the parameter and
broadcast the new parameter
>> memberActive - member models that are active, i.e., have generated gradients
>> memberAll - all member models
>> server - the server model
>> optimizer - the optimizer
>> updater - the worker that updates the parameters
>> broadcaster - the worker that broadcasts the new parameters to all member
models
<< return - successful or not
*/
bool XWorkerCollect::AddJobUpdateAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater, XWorkerBroadcast * broadcaster)
{
CheckNTErrors(memberActive != NULL, "No input (active) member list!");
CheckNTErrors(memberAll != NULL, "No input (all) member list!");
CheckNTErrors(server != NULL, "No input server model!");
CheckNTErrors(optimizer != NULL, "No input optimizer!");
CheckNTErrors(updater != NULL, "No input updater!");
CheckNTErrors(broadcaster != NULL, "No input broadcaster!");
XList args;
args.Add(this);
args.AddInt(memberActive->count);
args.AddList(memberActive);
args.AddInt(memberAll->count);
args.AddList(memberAll);
args.Add(server);
args.Add(optimizer);
args.Add(updater);
args.Add(broadcaster);
if (isInstantRun)
XWorkerCollect::UpdateAll(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerCollect::UpdateAll, &args);
return true;
}
/*
add a new job of collecting data
>> sourceList - the list of models that we want collect data from
>> target - the destination of the collection
<< return - successful or not
*/
bool XWorkerCollect::AddJobCollect(XList * sourceList, XModel * target)
{
CheckNTErrors(sourceList != NULL, "no input source model list!");
CheckNTErrors(target != NULL, "no input target model!");
XList args;
args.Add(this);
args.AddInt(sourceList->count);
args.AddList(sourceList);
args.AddInt(0);
args.Add(target);
args.Add(NULL);
args.Add(NULL);
args.Add(NULL);
if (isInstantRun)
XWorkerCollect::UpdateAll(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerCollect::UpdateAll, &args);
return true;
}
/*
collect the data of the run (i.e., loss). This is a reducer.
>> sourceList - the list of records
>> target - the record in which we keep the reduced result
>> sleepTime - waiting time in collecting data
*/
void XWorkerCollect::CollectOtherData(XList* sourceList, XNNRecord* target, int sleepTime)
{
int finished = 0;
int* flags = new int[sourceList->count];
for (int i = 0; i < sourceList->count; i++)
flags[i] = 0;
while (1) {
for (int i = 0; i < sourceList->count; i++) {
if (flags[i] != 0)
continue;
XNNRecord* source = (XNNRecord*)sourceList->GetItem(i);
if (source->state == XWORKER_FINISHED) {
if(target != source)
target->Update(*source);
flags[i] = 1;
finished++;
}
}
if (finished == sourceList->count)
break;
XSleep(sleepTime);
}
delete[] flags;
}
/* wrapper of CollectOtherData */
void XWorkerCollect::CollectOther(XList* args)
{
//fprintf(stderr, "collect data other 0\n");
XWorkerCollect* collecter = (XWorkerCollect*)args->GetItem(0);
int sourceNum = args->GetItemInt(1);
/* the source records */
XList source;
for (int i = 0; i < sourceNum; i++) {
XNNRecord * record = (XNNRecord*)args->GetItem(2 + i);
source.Add(record);
}
/* the target record */
XNNRecord* target = (XNNRecord*)args->GetItem(2 + sourceNum);
collecter->CollectOtherData(&source, target, SLEEP_TIME_IN_COLLECTING_OTHER);
//fprintf(stderr, "collect data other 1\n");
}
/*
add a new job of collecting data of the run (i.e., loss). This is a reducer.
>> sourceList - the list of records
>> target - the record in which we keep the reduced result
*/
bool XWorkerCollect::AddJobCollectOther(XList* sourceList, XNNRecord* target)
{
CheckNTErrors(sourceList != NULL, "no input source record list!");
CheckNTErrors(target != NULL, "no input target record!");
XList args;
args.Add(this);
args.AddInt(sourceList->count);
args.AddList(sourceList);
args.Add(target);
if (isInstantRun)
XWorkerCollect::CollectOther(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerCollect::CollectOther, &args);
return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that collects data from workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-02
* minus 10 degrees centigrade comes again!
*/
#ifndef __XWORKERCOLLECT_H__
#define __XWORKERCOLLECT_H__
#include "XWorker.h"
#include "XModel.h"
#include "XWorkerJob.h"
#include "XWorkerUpdate.h"
#include "XWorkerBroadcast.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define SLEEP_TIME_IN_COLLECTING 5
#define SLEEP_TIME_IN_COLLECTING_OTHER 5
/*
data collection method
1) point-to-point
2) reduce sum
3) all-reduce (not in the enum yet; see CollectAllReduce below)
*/
enum DATA_COLLECT_TYPE { DATA_COLLECT_P2P, DATA_COLLECT_REDUCESUM};
/* This class defines the data-collecting worker. It collects (gradient) data
from workers for the leader (server). */
class XWorkerCollect : public XWorker
{
protected:
DATA_COLLECT_TYPE collectMode;
public:
/* constructor */
XWorkerCollect();
/* de-constructor */
~XWorkerCollect();
/* set the collection type */
void SetCollectMode(DATA_COLLECT_TYPE myMode);
/* collect the gradient data, update the parameters, and broadcast the
new parameters to all models. NOTE that this method just collects gradients
from member models. It then calls an XWorkerUpdate to update the parameters.
The XWorkerUpdate in turn calls an XWorkerBroadcast to broadcast the new
parameters back to the member models. */
void UpdateDataAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater, XWorkerBroadcast * broadcaster,
int sleepTime);
/* wrapper of UpdateDataAll */
static
void UpdateAll(XList * args);
/* P2P data collection */
void CollectP2P(XTensor * source, XTensor * target);
/* sum-reduce for given tensors */
void CollectReduceSum(XList * source, XTensor * target);
/* all-reduce */
void CollectAllReduce(XList * all);
/* add a new job of collecting data, update the parameter and broadcast the new parameter */
bool AddJobUpdateAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater, XWorkerBroadcast * broadcaster);
/* add a new job of collecting data */
bool AddJobCollect(XList * sourceList, XModel * target);
/* collect the data of the run (i.e., loss). This is a reducer. */
void CollectOtherData(XList * sourceList, XNNRecord * target, int sleepTime);
/* wrapper of CollectOtherData */
static
void CollectOther(XList * args);
/* add a new job of collecting data of the run (i.e., loss) */
bool AddJobCollectOther(XList * sourceList, XNNRecord * target);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker of running the neural network.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
*/
#include "XWorkerJob.h"
#include "../tensor/XList.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XWorkerJob::XWorkerJob()
{
Clear();
}
/* de-constructor */
XWorkerJob::~XWorkerJob()
{
for (int i = 0; i < inputs.count; i++)
delete (XTensor*)inputs[i];
for (int i = 0; i < outputs.count; i++)
delete (XTensor*)outputs[i];
for (int i = 0; i < golds.count; i++)
delete (XTensor*)golds[i];
for (int i = 0; i < losses.count; i++)
delete (XTensor*)losses[i];
}
/* set the model */
void XWorkerJob::SetModel(XModel * myModel)
{
model = myModel;
}
/* get the model */
XModel * XWorkerJob::GetModel()
{
return model;
}
/* set the state of the worker */
void XWorkerJob::SetState(XWORKER_STATE myState)
{
state = myState;
record.state = myState;
}
/* clear the worker */
void XWorkerJob::Clear()
{
for (int i = 0; i < inputs.count; i++)
delete (XTensor*)inputs[i];
inputs.Clear();
inputs.Add(new XTensor());
for (int i = 0; i < outputs.count; i++)
delete (XTensor*)outputs[i];
outputs.Clear();
outputs.Add(new XTensor());
for (int i = 0; i < golds.count; i++)
delete (XTensor*)golds[i];
golds.Clear();
golds.Add(new XTensor());
for (int i = 0; i < losses.count; i++)
delete (XTensor*)losses[i];
losses.Clear();
losses.Add(new XTensor());
record.Clear();
SetState(XWORKER_UNSTARTED);
}
/* get the input list */
XList * XWorkerJob::GetInput()
{
return &inputs;
}
/* get the output list */
XList * XWorkerJob::GetOutput()
{
return &outputs;
}
/* get the gold standard */
XList * XWorkerJob::GetGold()
{
return &golds;
}
/* get the loss */
XList * XWorkerJob::GetLoss()
{
return &losses;
}
/* get the record of the run */
XNNRecord * XWorkerJob::GetRecord()
{
return &record;
}
/* record some stuff */
void XWorkerJob::RecordMe()
{
float lossAll = 0;
int sampleNum = 0;
for (int i = 0; i < losses.count; i++) {
XTensor* loss = (XTensor*)losses[i];
lossAll += ReduceSumAllValue(*loss);
sampleNum += loss->GetSize();
}
record.lossAll = lossAll;
record.sampleNum = sampleNum;
int predictNum = 0;
for (int i = 0; i < outputs.count; i++) {
XTensor* output = (XTensor*)outputs[i];
predictNum += output->GetSize();
}
record.predictNum = predictNum;
}
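/* A concrete reading of the bookkeeping above: if this worker holds one
loss tensor with 32 entries summing to 80.5, RecordMe() sets
lossAll = 80.5 and sampleNum = 32, so the loss reported by the trainer
(lossAll / sampleNum) is about 2.52. Note that sampleNum counts loss
entries (GetSize), not sentences or batches. */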
/* get the sum of losses over samples */
float XWorkerJob::GetLossAll()
{
return record.lossAll;
}
/* get the number of samples */
int XWorkerJob::GetSampleNum()
{
return record.sampleNum;
}
/* get the number of outputs (predictions) */
int XWorkerJob::GetPredictNum()
{
return record.predictNum;
}
/*
add a new job of model refreshment
>> myModel - the model
<< return - succeeded or not
*/
bool XWorkerJob::AddJobRefresh(XModel * myModel)
{
//fprintf(stderr, "refresh 0\n");
CheckNTErrors(myModel != NULL, "no parameter keeper!");
XList args(1);
args.Add(myModel);
if(isInstantRun)
XModel::Refresh(&args);
else
queue.EnqueueJob((void*)(char*)XModel::Refresh, &args);
//fprintf(stderr, "refresh 1\n");
return true;
}
/*
add a new job of neural network forward and backward computation (with the input)
>> myModel - the model
>> inputs - inputs of the neural network
>> outputs - outputs of the neural network
>> golds - gold standards
>> losses - losses of the outputs with respect to the gold standards
<< return - succeeded or not
*/
bool XWorkerJob::AddJobNeuralNet(XModel * myModel,
XList * inputs, XList * outputs, XList * golds, XList * losses)
{
CheckNTErrors(myModel != NULL, "no input neural network!");
CheckNTErrors(inputs != NULL, "no inputs of the model!");
CheckNTErrors(outputs != NULL, "no outputs of the model!");
XList args;
args.Add(myModel);
args.Add(inputs);
args.Add(outputs);
args.Add(golds);
args.Add(losses);
if(isInstantRun)
XModel::Run(&args);
else
queue.EnqueueJob((void*)(char*)XModel::Run, &args);
SetState(XWORKER_STARTED);
return true;
}
/* wrapper of RecordMe */
void XWorkerJob::RecordMeStatic(XList* args)
{
//fprintf(stderr, "record static 0\n");
CheckNTErrors(args != NULL && args->count > 0, "Illegal arguments!");
XWorkerJob * worker = (XWorkerJob*)args->GetItem(0);
XNNRecord * serverRecord = (XNNRecord *)args->GetItem(1);
worker->RecordMe();
/* push information to the server end */
MUTEX_LOCK(serverRecord->mutex);
serverRecord->Update(*worker->GetRecord());
MUTEX_UNLOCK(serverRecord->mutex);
worker->SetState(XWORKER_FINISHED);
//fprintf(stderr, "record static 1\n");
}
/*
add a new job of recording the running of the neural network
>> serverRecord - the record on the server side
*/
bool XWorkerJob::AddJobRecord(XNNRecord * serverRecord)
{
XList args;
args.Add(this);
args.Add(serverRecord);
if (isInstantRun)
XWorkerJob::RecordMeStatic(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerJob::RecordMeStatic, &args);
return true;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker of running the neural network.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
* My son got new glasses yesterday.
*/
#ifndef __XWORKERJOB_H__
#define __XWORKERJOB_H__
#include "XWorker.h"
#include "XModel.h"
#include "XNNRecord.h"
#include "XBaseTemplate.h"
#include "../tensor/XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* a model template for training */
class XWorkerJob : public XWorker
{
protected:
/* the model */
XModel * model;
/* the input tensors of the model */
XList inputs;
/* the output tensors of the model */
XList outputs;
/* the gold standard */
XList golds;
/* the loss */
XList losses;
/* record the information in running the neural network */
XNNRecord record;
public:
/* constructor */
XWorkerJob();
/* de-constructor */
~XWorkerJob();
/* set the parameter keeper */
void SetModel(XModel * myModel);
/* get the parameter keeper */
XModel * GetModel();
/* set the state of the worker */
void SetState(XWORKER_STATE myState);
/* clear the worker */
void Clear();
/* get the input list */
XList * GetInput();
/* get the output list */
XList * GetOutput();
/* get the gold standard */
XList * GetGold();
/* get the loss */
XList * GetLoss();
/* get the record of the run */
XNNRecord * GetRecord();
/* record some stuff */
void RecordMe();
/* get the sum of losses over samples */
float GetLossAll();
/* get the number of samples */
int GetSampleNum();
/* get the number of outputs (predictions) */
int GetPredictNum();
/* add a new job of model refreshment */
bool AddJobRefresh(XModel * myModel);
/* add a new job of neural network forward and backward computation (with the input) */
bool AddJobNeuralNet(XModel * myModel, XList * inputs, XList * outputs, XList * golds, XList * losses);
/* add a new job of recording the running of the neural network */
bool AddJobRecord(XNNRecord * serverRecord);
private:
/* wrapper of RecordMe */
static
void RecordMeStatic(XList * args);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that updates the model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#include "XWorkerUpdate.h"
namespace nts { // namespace nts (NiuTrans.Tensor)
/* constructor */
XWorkerUpdate::XWorkerUpdate()
{
optimizer = NULL;
}
/* de-constructor */
XWorkerUpdate::~XWorkerUpdate()
{
}
/* set the optimizer */
void XWorkerUpdate::SetOptimizer(XOptimizer * myOptimizer)
{
optimizer = myOptimizer;
}
/* get the optimizer */
XOptimizer * XWorkerUpdate::GetOptimizer()
{
return optimizer;
}
/*
update a parameter of a model
>> server - the model that we want to update (on the server side)
>> members - models that would share the updated parameters
>> pid - the parameter index
>> optimizer - the optimizer
>> broadcaster - the worker that would broadcast the new parameter to members
*/
void XWorkerUpdate::UpdateParameter(XModel * server, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster)
{
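    /* parameter state machine: a parameter enters here as PARAM_STATE_COLLECTED
       (its gradient has been aggregated) and leaves as PARAM_STATE_UPDATED;
       broadcasting the fresh value to the member models is delegated to
       another worker so that this thread is not blocked */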
CheckNTErrors(server->params[pid].flag == PARAM_STATE_COLLECTED, "The state of the parameter is wrong!");
XTensor * param = server->params[pid].param;
XTensor * grad = param->grad;
CheckNTErrors(grad != NULL, "No gradient!");
/* update the parameter */
optimizer->UpdateParam(param, grad, pid);
/* set the flag */
server->params[pid].flag = PARAM_STATE_UPDATED;
/* broadcast the new parameter to other models (in another worker/thread) */
broadcaster->AddJobBroadcastSingle(server, members, pid);
broadcaster->AddJobEnqueueFinished();
}
/*
update the model
>> model - the model that we want to update
>> optimizer - the optimizer
>> sleepTime - waiting time in each update
*/
void XWorkerUpdate::UpdateModel(XModel * model, XOptimizer * optimizer, int sleepTime)
{
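    /* polling protocol: repeatedly scan the parameter flags and update any
       parameter whose gradient has been collected, until all paramNum
       parameters have been updated once in this round */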
int finished = 0;
while (1) {
for (int i = 0; i < model->paramNum; i++) {
if (model->params[i].flag == PARAM_STATE_COLLECTED) {
XTensor * param = model->params[i].param;
XTensor * grad = param->grad;
CheckNTErrors(grad != NULL, "No gradient!");
/* update the parameter */
optimizer->UpdateParam(param, grad, i);
/* set the flag */
model->params[i].flag = PARAM_STATE_UPDATED;
finished++;
}
}
if (finished == model->paramNum)
break;
XSleep(sleepTime);
}
optimizer->Note(model);
}
/*
wrapper of UpdateParameter
>> args - arguments of the update
*/
void XWorkerUpdate::UpdateSingle(XList * args)
{
CheckNTErrors(args != NULL && args->count >= 6, "Illegal argument list!");
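    /* expected argument layout (mirroring AddJobUpdateSingle below):
       0: the updater, 1: the server-side model, 2: number of member models,
       3 ... 2 + memNum: the member models, 3 + memNum: the parameter index,
       4 + memNum: the optimizer, 5 + memNum: the broadcaster */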
XWorkerUpdate * updater = (XWorkerUpdate*)args->GetItem(0);
XModel * server = (XModel*)args->GetItem(1);
int memNum = args->GetInt(2);
XList members;
for (int i = 0; i < memNum; i++) {
XModel * member = (XModel*)args->GetItem(3 + i);
members.Add(member);
}
int pid = args->GetInt(3 + memNum);
XOptimizer * optimizer = (XOptimizer*)args->GetItem(3 + memNum + 1);
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(3 + memNum + 2);
if(updater != NULL)
updater->UpdateParameter(server, &members, pid, optimizer, broadcaster);
}
/*
wrapper of UpdateModel
>> args - arguments of the update
*/
void XWorkerUpdate::Update(XList * args)
{
//fprintf(stderr, "update 0\n");
CheckNTErrors(args != NULL && args->count >= 3, "Illegal argument list!");
XWorkerUpdate * updater = (XWorkerUpdate*)args->GetItem(0);
XModel * model = (XModel*)args->GetItem(1);
XOptimizer * optimizer = (XOptimizer*)args->GetItem(2);
if(updater != NULL)
updater->UpdateModel(model, optimizer, SLEEP_TIME_IN_MODEL_UPDATE);
//fprintf(stderr, "update 1\n");
}
/*
add a new job of model update (for a parameter)
>> model - the model that we want to update (on the server side)
>> members - models that would share the updated parameters
>> pid - the parameter index
>> optimizer - the optimizer
>> broadcaster - the worker that would broadcast the new parameter to members
*/
bool XWorkerUpdate::AddJobUpdateSingle(XModel * model, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster)
{
CheckNTErrors(model != NULL, "No input model!");
CheckNTErrors(members != NULL, "No member model list!");
CheckNTErrors(optimizer != NULL, "No optimizer!");
CheckNTErrors(broadcaster != NULL, "No broadcaster!");
CheckNTErrors(pid >= 0 && pid < model->paramNum, "Illegal parameter index!");
XList args;
args.Add(this);
args.Add(model);
args.AddInt(members->count);
args.AddList(members);
args.AddInt(pid);
args.Add(optimizer);
args.Add(broadcaster);
if (isInstantRun)
XWorkerUpdate::UpdateSingle(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerUpdate::UpdateSingle, &args);
return true;
}
/*
add a new job of model update
>> model - the model that we want to update
>> optimizer - the optimizer
*/
bool XWorkerUpdate::AddJobUpdate(XModel * model, XOptimizer * optimizer)
{
CheckNTErrors(model != NULL, "No input model!");
CheckNTErrors(optimizer != NULL, "No optimizer!");
XList args;
args.Add(this);
args.Add(model);
args.Add(optimizer);
if(isInstantRun)
XWorkerUpdate::Update(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerUpdate::Update, &args);
return true;
}
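/* A minimal usage sketch (a comment only; "serverModel", "memberModels",
   "adam" and "broadcaster" are hypothetical names for objects assumed to be
   created and wired up elsewhere):

       XWorkerUpdate updater;
       updater.SetOptimizer(&adam);
       updater.AddJobUpdateSingle(&serverModel, &memberModels, pid, &adam, &broadcaster);

   With isInstantRun disabled the job is queued and later executed by the
   worker's own thread. */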
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that updates the model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#ifndef __XWORKERUPDATE_H__
#define __XWORKERUPDATE_H__
#include "XWorker.h"
#include "XOptimizer.h"
#include "XWorkerBroadcast.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define SLEEP_TIME_IN_MODEL_UPDATE 5
/* The class defines the model-update worker */
class XWorkerUpdate : public XWorker
{
protected:
/* the optimizer */
XOptimizer * optimizer;
public:
/* constructor */
XWorkerUpdate();
/* de-constructor */
~XWorkerUpdate();
/* set the optimizer */
void SetOptimizer(XOptimizer * myOptimizer);
/* get the optimizer */
XOptimizer * GetOptimizer();
/* update the parameter */
void UpdateParameter(XModel * server, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster);
/* update the model */
void UpdateModel(XModel * model, XOptimizer * optimizer, int sleepTime);
/* wrapper of UpdateParameter */
static
void UpdateSingle(XList * args);
/* wrapper of UpdateModel */
static
void Update(XList * args);
/* add a new job of model update (for a parameter) */
bool AddJobUpdateSingle(XModel * model, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster);
/* add a new job of model update */
bool AddJobUpdate(XModel * model, XOptimizer * optimizer);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* An implementation of the Adam optimizer.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-15
*/
#include "Adam.h"
#include "../../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
Adam::Adam() : XOptimizer()
{
Clear();
}
/* de-constructor */
Adam::~Adam()
{
Clear();
}
/*
initialize the optimizer
>> config - the configuration
*/
void Adam::Init(XConfig &config)
{
XOptimizer::Init(config);
adamBeta1 = config.GetFloat("adambeta1", 0.9F);
adamBeta2 = config.GetFloat("adambeta2", 0.98F);
adamDelta = config.GetFloat("adamdelta", 1e-9F);
}
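/* A small sketch of how the hyper-parameters above are supplied (assuming
   the XConfig instance is filled from the command line or a file elsewhere):

       Adam opt;
       opt.Init(config);     // reads "adambeta1", "adambeta2", "adamdelta"
       opt.ShowSettings();   // print the settings that were loaded
*/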
/* clear the optimizer */
void Adam::Clear()
{
XOptimizer::Clear();
for (int i = 0; i < moments.count; i++) {
XTensor * m = moments[i];
delete m;
}
moments.Clear();
for (int i = 0; i < moments2nd.count; i++) {
XTensor * m2nd = moments2nd[i];
delete m2nd;
}
moments2nd.Clear();
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
/* reset the optimizer (re-start) */
void Adam::Reset()
{
for (int i = 0; i < moments.count; i++) {
XTensor * m = moments[i];
m->SetZeroAll();
}
for (int i = 0; i < moments2nd.count; i++) {
XTensor * m2nd = moments2nd[i];
m2nd->SetZeroAll();
}
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
/* show settings */
void Adam::ShowSettings()
{
XPRINT(1, stderr, "[INFO] Optimizer = Adam\n");
XOptimizer::ShowSettings();
XPRINT2(1, stderr, "%25s = %f\n", "adambeta1", adamBeta1);
XPRINT2(1, stderr, "%25s = %f\n", "adambeta2", adamBeta2);
XPRINT2(1, stderr, "%25s = %f\n", "adamdelta", adamDelta);
}
/* record the update */
void Adam::Note(XModel * model)
{
nstep++;
}
/*
update a parameter matrix using Adam
>> param - the parameter to update
>> grad - the gradient of the parameter
>> pid - index of the parameter
*/
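/* note: moments[pid] and moments2nd[pid] are assumed to have been allocated
   (one tensor per model parameter) before the first update; Clear() above
   frees them and Reset() zeroes them */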
void Adam::UpdateParam(XTensor * param, XTensor * grad, int pid)
{
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
float e = lrate * (float)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
float d = adamDelta * (float)sqrt(1 - adamBeta2T);
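    /* the bias corrections of Adam are folded into scalars here: with
       m^ = m / (1 - beta_1^t) and v^ = v / (1 - beta_2^t), the update
           param = param - lrate * m^ / (sqrt(v^) + delta)
       is algebraically the same as
           param = param - e * m / (sqrt(v) + d)
       with e and d as computed above, so no extra tensor work is needed */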
/* m = beta_1 * m + (1-beta_1) * grad */
XTensor * m = moments[pid];
_ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, grad, m, (1.0F - adamBeta1));
/* v = beta_2 * v + (1-beta_2) * grad * grad */
XTensor * v = moments2nd[pid];
_Multiply(grad, grad, v, adamBeta2 / (1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* allocate a piece of buffer memory */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBuf(v, v->devID);
/* v2 = m / (sqrt(v) + d) */
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
/* the delta rule */
_Sum(param, v2, param, -e);
/* release a piece of buffer memory */
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* An implementation of the Adam optimizer.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-15
* A foggy day. But all my students come back for work after the holiday
* - full of happiness to see a new start.
*/
#ifndef __ADAM_H__
#define __ADAM_H__
#include "../XOptimizer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* an implementation of the Adam optimizer */
class Adam : public XOptimizer
{
protected:
/* list of the moment of the parameter matrices */
TensorList moments;
/* list of the 2nd order moment of the parameter matrices */
TensorList moments2nd;
/* hyper parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
public:
/* constructor */
Adam();
/* de-constructor */
~Adam();
/* initialize the optimizer */
void Init(XConfig &config);
/* clear the optimizer */
void Clear();
/* reset the optimizer (re-start) */
void Reset();
/* show settings */
void ShowSettings();
/* record the update */
void Note(XModel * model);
/* update a parameter matrix */
void UpdateParam(XTensor * param, XTensor * grad, int pid);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is a header that includes all optimizer headers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-15
*/
#include "XTrainer.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XTrainer::XTrainer()
{
}
#ifndef __OHEADER_H__
#define __OHEADER_H__
/* de-constructor */
XTrainer::~XTrainer()
{
}
#include "Adam.h"
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
#endif
\ No newline at end of file