Commit b891e547 by xiaotong

create the source files

parent 1cbd6218
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is the entrance of the low-level tensor library : NiuTrans.Tensor
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2015-12-14
*
*/
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "XDevice.h"
#include "XTensor.h"
#include "./sample/fnnlm/FNNLM.h"
#include "test/Test.h"
using namespace nts;
using namespace samplefnnlm;
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
/*
entry point of the NiuTrans.Tensor tool
>> argc - number of command-line arguments
>> argv - command-line arguments; argv[1] selects the mode:
          "-test" runs the unit tests, "-fnnlm" runs the FNNLM sample
<< return - 0, or the return code of FNNLMMain in "-fnnlm" mode
*/
int main( int argc, const char ** argv )
{
    srand((unsigned)time(0));

    /* BUG FIX: the branches are now chained with "else if" - previously
       running with "-test" also fell into the final else branch and
       printed the usage banner after the tests finished */
    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        return FNNLMMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
        fprintf(stderr, "use of tensors. All you need is to ... \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
    }

    return 0;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is a wrapper of the BLAS (Basic Linear Algebra Subprograms http://www.netlib.org/blas/)
* libraries. By using BLAS, we can access very fast matrix operations although they
* are also implemented in NiuTrans in a native manner. To use BLAS,
* set USE_BLAS.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-04-08
*
*/
#ifdef WIN32
#include <wtypes.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include "XBLAS.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#ifdef WIN32
HINSTANCE hBLASDll;
#endif
/* 
pointers to the BLAS routines; on Windows they are bound at run time by
LoadBLAS() via GetProcAddress, on other platforms they are assigned to
the statically linked cblas functions
*/
/* single-precision floating matrix-matrix multiplication (cblas_sgemm) */
void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication (cblas_dgemm) */
void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
double *, OPENBLAS_CONST BLASINT);
/* single-precision floating vector-vector multiplication (rank-1 update, cblas_sger) */
void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1 update, cblas_dger) */
void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
/* set the number of threads used by the BLAS library */
void (*XBLAS_SET_THREAD_NUM)(int);
/* get the number of threads */
//int (*XBLAS_GET_THREAD_NUM)();
/* get the number of physical processors (cores) */
int (*XBLAS_GET_CORE_NUM)();
/* get the CPU corename */
//char * (*XBLAS_GET_CORE_NAME)();
/* get the parallelization type used by OpenBLAS */
//int (*XBLAS_GET_PARALLEL_TYPE)(void);
#if defined(USE_BLAS)
/* 
load the BLAS library and bind the XBLAS_* function pointers
>> dllFileName - path of the BLAS dll (used on Windows only; on other
                 platforms the functions are linked statically and the
                 argument is ignored)
*/
void LoadBLAS(const char * dllFileName)
{
#ifndef CUDA_BLAS
#ifdef _WIN32
#if defined(OPENBLAS)
    /* non-ascii characters are not supported yet */
    size_t len = strlen(dllFileName);
    wchar_t * fn = new wchar_t[len + 1];
    memset(fn, 0, sizeof(wchar_t) * (len + 1));
    for(size_t i = 0; i < len; i++)
        fn[i] = dllFileName[i];
    hBLASDll = LoadLibrary((LPCWSTR)fn);
    /* the wide-string copy is no longer needed once LoadLibrary returns */
    delete[] fn;
    if(!hBLASDll){
        XPRINT1(0, stderr, "[LoadBLAS] Error! Cannot load dll %s!\n", dllFileName);
        exit(1);
    }
    /* matrix-matrix multiplication */
    (FARPROC&)XBLAS_SGEMM = GetProcAddress(hBLASDll, "cblas_sgemm");
    (FARPROC&)XBLAS_DGEMM = GetProcAddress(hBLASDll, "cblas_dgemm");
    /* vector-vector multiplication */
    (FARPROC&)XBLAS_SGER = GetProcAddress(hBLASDll, "cblas_sger");
    (FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
    /* multi-threading */
    (FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_set_num_threads");
    //(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "goto_set_num_threads");
    //(FARPROC&)XBLAS_GET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_get_num_threads");
    (FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "openblas_get_num_procs");
    //(FARPROC&)XBLAS_GET_CORE_NAME = GetProcAddress(hBLASDll, "openblas_get_corename");
    //(FARPROC&)XBLAS_GET_PARALLEL_TYPE = GetProcAddress(hBLASDll, "openblas_get_parallel");
#endif // defined(OPENBLAS)
#if defined(MKL)
    /* non-ascii characters are not supported yet */
    size_t len = strlen(dllFileName);
    wchar_t * fn = new wchar_t[len + 1];
    memset(fn, 0, sizeof(wchar_t) * (len + 1));
    for(size_t i = 0; i < len; i++)
        fn[i] = dllFileName[i];
    hBLASDll = LoadLibrary((LPCWSTR)fn);
    /* BUG FIX: this buffer was leaked in the MKL branch */
    delete[] fn;
    if(!hBLASDll){
        XPRINT1(0, stderr, "[LoadBLAS] Error! Cannot load dll %s!\n", dllFileName);
        exit(1);
    }
    /* matrix-matrix multiplication */
    (FARPROC&)XBLAS_SGEMM = GetProcAddress(hBLASDll, "cblas_sgemm");
    (FARPROC&)XBLAS_DGEMM = GetProcAddress(hBLASDll, "cblas_dgemm");
    /* vector-vector multiplication */
    (FARPROC&)XBLAS_SGER = GetProcAddress(hBLASDll, "cblas_sger");
    (FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
    /* multi-threading */
    (FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "MKL_Set_Num_Threads");
    (FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "MKL_Get_Max_Threads");
#endif // defined(MKL)
#else // _WIN32
    XBLAS_SGEMM = &cblas_sgemm;
    XBLAS_DGEMM = &cblas_dgemm;
    XBLAS_SGER = &cblas_sger;
    XBLAS_DGER = &cblas_dger;
#if defined(OPENBLAS)
    XBLAS_SET_THREAD_NUM = &openblas_set_num_threads;
    XBLAS_GET_CORE_NUM = &openblas_get_num_procs;
#endif // defined(OPENBLAS)
#if defined(MKL)
    XBLAS_SET_THREAD_NUM = &mkl_set_num_threads;
    XBLAS_GET_CORE_NUM = &mkl_get_max_num_threads;
#endif // defined(MKL)
#endif // _WIN32
    /* guard: on Windows the symbol lookup may have failed, leaving the
       pointer NULL; calling through it would crash */
    if(XBLAS_SET_THREAD_NUM != NULL)
        XBLAS_SET_THREAD_NUM(1);
#endif // ndef(CUDA_BLAS)
}
/* unload the BLAS library (Windows only; a no-op on other platforms) */
void UnloadBLAS()
{
#ifdef _WIN32
    /* nothing to do if LoadBLAS() was never called or never succeeded */
    if(hBLASDll == NULL)
        return;
    if(!FreeLibrary(hBLASDll)){
        XPRINT(0, stderr, "[UnloadBLAS] Error! Cannot free the BLAS dll!\n");
        exit(1);
    }
    /* avoid a dangling handle if UnloadBLAS() is called twice */
    hBLASDll = NULL;
#else
#endif
}
#else // undefined(USE_BLAS) || undefined(OPENBLAS)
/* stub: compiled when no BLAS backend is enabled (see USE_BLAS);
   reports the configuration error and terminates */
void LoadBLAS(const char * dllFileName)
{
XPRINT(0, stderr, "[LoadBLAS] Error! No Blas lib is available. Please use OPENBLAS or MKL!\n");
exit(1);
}
/* stub: compiled when no BLAS backend is enabled (see USE_BLAS);
   reports the configuration error and terminates */
void UnloadBLAS()
{
XPRINT(0, stderr, "[UnloadBLAS] Error! No Blas lib is available. Please use OPENBLAS or MKL!\n");
exit(1);
}
#endif // defined(USE_BLAS) && defined(OPENBLAS)
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is a wrapper of the BLAS (Basic Linear Algebra Subprograms http://www.netlib.org/blas/)
* libraries. By using BLAS, we can access very fast matrix operations although they
* are also implemented in NiuTrans in a native manner. To use BLAS,
* specify USE_BLAS when compiling the code.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-04-08
*
*/
#ifndef __XBLAS_H__
#define __XBLAS_H__
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* some of the code below is from OpenBLAS (https://github.com/xianyi/OpenBLAS) */
//#define OPENBLAS
/* const qualifier used by the CBLAS prototypes */
#define OPENBLAS_CONST const
/* integer type used by the CBLAS interface */
typedef int BLASINT;
/* matrix storage order (row-major vs column-major), CBLAS convention */
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
/* transposition options for matrix operands */
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
/* which triangle of a symmetric/triangular matrix is referenced */
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
/* whether a triangular matrix has a unit diagonal */
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
/* side on which a matrix operand appears in a product */
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
#if defined(USE_BLAS)
/*
single/double-precision floating matrix-matrix multiplication (rank-3)
- SGEMM (ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
It implements C = \alpha * op(A)*op(B) + \beta * C
where A, B and C are matrices,
\alpha and \beta are coefficients,
TRANSA specifies we need a transposed matrix (op(A)=A**T); otherwise op(A) = A,
M specifies the row number of op(A),
N specifies the column number of op(B),
K specifies the column number of op(A),
LDA(=K) specifies the size of the first(or leading) dimension of A as declared in the calling (sub) program,
E.g., if we are using CblasRowMajor, the leading dimension is the number of columns.
LDB(=N) specifies the size of the first dimension of B as declared in the calling (sub) program,
and LDC(=N) specifies the size of the first dimension of C as declared in the calling (sub) program.
*/
extern "C" void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication */
extern "C" void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
double *, OPENBLAS_CONST BLASINT);
/*
single/double-precision floating vector-vector multiplication (rank-2)
- SGER (ORDER,M, N, ALPHA, X, INCX, Y, INCY, A, LDA)
It implements A = \alpha * X * (Y^T) + A
where X and Y are vectors with m and n elements respectively,
A is an m by n matrix,
\alpha is the scalar,
INCX specifies the increment for the elements of X,
INCY specifies the increment for the elements of Y,
LDA specifies the size of the first(or leading) dimension of A as declared in the calling (sub) program,
E.g., if we are using CblasRowMajor, the leading dimension is the number of columns of A.
*/
extern "C" void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1) */
extern "C" void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
/* set the number of threads */
extern "C" void (*XBLAS_SET_THREAD_NUM)(int);
/* get the number of threads */
//extern "C" int (*XBLAS_GET_THREAD_NUM)();
/* get the number of physical processors (cores).*/
extern "C" int (*XBLAS_GET_CORE_NUM)();
/* get the CPU corename */
//extern "C" char * (*XBLAS_GET_CORE_NAME)();
/* get the parallelization type used by OpenBLAS */
//extern "C" int (*XBLAS_GET_PARALLEL_TYPE)(void);
/* linux systems */
#ifndef _WIN32
/* cblas functions that are imported from the lib. See cblas.h in OpenBlas for more information */
extern "C" void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST BLASINT K, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *A, OPENBLAS_CONST BLASINT lda,
OPENBLAS_CONST float *B, OPENBLAS_CONST BLASINT ldb,
OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST BLASINT ldc);
extern "C" void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST BLASINT K, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *A, OPENBLAS_CONST BLASINT lda,
OPENBLAS_CONST double *B, OPENBLAS_CONST BLASINT ldb,
OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST BLASINT ldc);
extern "C" void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *X, OPENBLAS_CONST BLASINT incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT incY,
float *A, OPENBLAS_CONST BLASINT lda);
extern "C" void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *X, OPENBLAS_CONST BLASINT incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT incY,
double *A, OPENBLAS_CONST BLASINT lda);
#if defined(OPENBLAS)
/* better control of multi-threading */
extern "C" void openblas_set_num_threads(int num_threads);
extern "C" void goto_set_num_threads(int num_threads);
//extern "C" int openblas_get_num_threads(void);
extern "C" int openblas_get_num_procs(void);
//extern "C" char* openblas_get_config(void);
//extern "C" char* openblas_get_corename(void);
//extern "C" int openblas_get_parallel(void);
#endif
#endif
#if defined(MKL)
/* better control of multi-threading */
//_Mkl_Api(void,MKL_Set_Num_Threads,(int nth))
//_Mkl_Api(int,MKL_Get_Max_Threads,(void))
extern "C" void MKL_Set_Num_Threads(int num_threads);
extern "C" int MKL_Get_Max_Threads();
#define mkl_set_num_threads MKL_Set_Num_Threads
#define mkl_get_max_num_threads MKL_Get_Max_Threads
//extern "C" void mkl_set_num_threads(int num_threads);
//extern "C" void omp_set_num_threads(int num_threads);
//extern "C" int mkl_get_max_num_threads();
#endif
#if defined(CUDA_BLAS)
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
#include <helper_cuda.h>
/* Matrix multiplication */
extern void BLASMatrixMULS(int deviceID, float * a, float * b, float * c, int na, int ma, int nb, int mb, int nc, int mc, float alpha = 1.0F, float beta = 0);
extern void BLASMatrixMULD(int deviceID, double * a, double * b, double * c, int na, int ma, int nb, int mb, int nc, int mc, double alpha = 1.0F, double beta = 0);
#endif
#endif
#ifdef _WIN32
#include "windows.h"
extern HINSTANCE hBLASDll;
#else
#endif
/* load some stuff for BLAS */
extern void LoadBLAS(const char * dllFileName);
/* unload the libs */
extern void UnloadBLAS();
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-05-30
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "XDataType.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* 
get the string name of a tensor data type
>> type - the data type
<< return - its name, or "NULL" for an unrecognized value
*/
const char * GetDataTypeName(TENSOR_DATA_TYPE type)
{
    switch (type) {
        case X_INT:     return "X_INT";
        case X_INT8:    return "X_INT8";
        case X_FLOAT:   return "X_FLOAT";
        case X_FLOAT16: return "X_FLOAT16";
        case X_DOUBLE:  return "X_DOUBLE";
        default:        return "NULL";
    }
}
/* 
parse a tensor data type from its string name
>> typeName - name of the type, e.g. "X_FLOAT"
<< return - the corresponding data type; terminates via ShowNTErrors
            on an unknown name
*/
TENSOR_DATA_TYPE GetDataType(const char * typeName)
{
    if (!strcmp(typeName, "X_INT"))
        return X_INT;
    else if (!strcmp(typeName, "X_INT8"))
        return X_INT8;
    else if (!strcmp(typeName, "X_FLOAT"))
        return X_FLOAT;
    else if (!strcmp(typeName, "X_FLOAT16"))
        return X_FLOAT16;
    else if (!strcmp(typeName, "X_DOUBLE"))
        return X_DOUBLE;

    ShowNTErrors("Unknown data type!");
    /* unreachable (ShowNTErrors calls exit(1)); silences the
       "control reaches end of non-void function" warning */
    return X_FLOAT;
}
/****************************************************
Below are helper routines for the float16 (half-precision)
representation: float <-> float16 conversion on the CPU.
*/
/* 
float -> float16 (IEEE binary16 bit layout, truncating conversion)
NOTE(review): NaNs, infinities and denormals are not handled specially,
and 0.0f does not map to 0x0000 under this formula - kept as-is to stay
the exact inverse companion of Float16ToFloat() below.
>> f - the single-precision value
<< return - its 16-bit encoding
*/
_XINLINE_ unsigned short FloatToFloat16(float f)
{
    /* bit-level reinterpretation via memcpy: the original pointer cast
       (*(unsigned int*)&f) violated strict-aliasing rules (UB in C++) */
    unsigned int x;
    memcpy(&x, &f, sizeof(x));
    unsigned short h = (unsigned short)(((x >> 16) & 0x8000) |
                                        ((((x & 0x7f800000) - 0x38000000) >> 13) & 0x7c00) |
                                        ((x >> 13) & 0x03ff));
    return h;
}
/* 
float16 -> float
BUG FIX: the original code converted the assembled 32-bit pattern to
float *numerically* (float(bits)), so e.g. h=0x3C00 yielded 1.0653e9f
instead of 1.0f; the bits must be reinterpreted as a float. Also, the
sign bit is now assembled in unsigned arithmetic to avoid shifting into
the sign bit of a signed int (UB).
>> h - the 16-bit encoding
<< return - the single-precision value
*/
_XINLINE_ float Float16ToFloat(unsigned short h)
{
    unsigned int x = (((unsigned int)(h & 0x8000)) << 16) |
                     (((unsigned int)((h & 0x7c00) + 0x1C000)) << 13) |
                     (((unsigned int)(h & 0x03FF)) << 13);
    float f;
    memcpy(&f, &x, sizeof(f));
    return f;
}
/*
data conversion on the CPU
>> devID - device id (must be < 0, i.e., a CPU device)
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
    /* BUG FIX: the old message said "must be run on GPUs" although the
       condition (devID < 0) requires a CPU device */
    CheckNTErrors((devID < 0), "This code must be run on CPUs!");

    /* nothing to do when source and target types agree */
    if(typeS == typeT)
        return;

    if(typeS == X_FLOAT && typeT == X_FLOAT16){
        for(int i = 0; i < size; i++){
            ((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
        }
    }
    else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
        for(int i = 0; i < size; i++){
            ((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
        }
    }
    else{
        ShowNTErrors("Unsupported data types for conversion!");
    }
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-05-30
*
*/
#ifndef __XDATATYPE_H__
#define __XDATATYPE_H__
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* data type of the tensor, e.g., int, float, and double. */
enum TENSOR_DATA_TYPE {X_INT, X_INT8, X_FLOAT, X_FLOAT16, X_DOUBLE};
/* transposed matrix type */
enum MATRIX_TRANS_TYPE{X_TRANS, X_NOTRANS};
/* default data type (selected at compile time) */
#ifdef DOUBELPRICSION
#define DEFAULT_DTYPE X_DOUBLE
#else
#define DEFAULT_DTYPE X_FLOAT
#endif
/* get the string name of a data type */
extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
/* parse a data type from its string name */
extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation)
   NOTE(review): these are declared extern "C" here but defined inside
   namespace nts without extern "C" in XDataType.cpp - confirm the
   linkage of declaration and definition actually matches */
extern "C" unsigned short FloatToFloat16(float f);
extern "C" float Float16ToFloat(unsigned short h);
extern "C" void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
/* GPU counterpart of ConvertDataType */
void CudaConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-06-23
*
*/
#ifndef __XDEVICE_H__
#define __XDEVICE_H__
#include "XThread.h"
#ifdef USE_CUDA
/* the CUDA stuff */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_LENGTH_OF_DEVICE_NAME 64
#define MAX_CPU_NUM 16
#define MAX_GPU_NUM 16
#define MAX_DEVICE_NUM MAX_CPU_NUM+MAX_GPU_NUM
#define INVALID_DEVICE_ID -1000
#define CURRENT_GPU 1000
//#define CUDA_UVA 1 // Unified Virtual Address Space of Cuda
/*
a class that records the basic information for each GPU/CPU device
e.g., the memory limit, warp size of a GPU and etc.
*/
class XDevice
{
public:
    /*
    device id
    <0:  CPU memory
    >=0: GPU device ID
    */
    int devID;

    /* size of the memory (NOTE(review): the unit is not stated here -
       confirm whether this is bytes or megabytes against Init()) */
    int memSize;

    /* warp size of an (Nvidia) GPU */
    int GPUWarpSize;

    /* 
    max grid size (or number of blocks) of an (Nvidia) GPU
    NOTE: the grid size is along three dimensions (x, y, z)
    */
    int GPUMaxGridSize[3];

    /* 
    max block size (or number of threads per block) of an (Nvidia) GPU
    NOTE: the block size is along three dimensions (x, y, z)
    */
    int GPUMaxBlockSize[3];

    /* max thread number that is supported */
    int GPUMaxThreadNum;

    /* max (and optimal) thread number for a block */
    int GPUMaxThreadNumPerBlock;

    /* name of the device */
    char name[MAX_LENGTH_OF_DEVICE_NAME];

    /* secondary name of the device */
    char name2[MAX_LENGTH_OF_DEVICE_NAME];

    /* specify whether Unified Virtual Address Space (UVA) is supported */
    bool isUVASupported;

#ifdef USE_CUDA
    /* mutex that serializes access to the cublas handle */
    MUTEX_HANDLE cublasMutex;

    /* handle used for cublas */
    cublasHandle_t cublasHandle;

    /* specify if the handle is initialized */
    bool isHandleReady;
#endif

public:
    /* constructor */
    XDevice();

    /* de-constructor */
    ~XDevice();

    /* initialize it and get the device information */
    void Init(int myDevID);

    /* clear it */
    void Clear();

#ifdef USE_CUDA
    /* get the cublas handle (see isHandleReady for its lazy setup) */
    cublasHandle_t * GetCublasHandle();
#endif

    /* switch to a device */
    static
    void SetGPUDevice(int devID);

    /* switch to a device (with fast GPU execution mode) */
    static
    void SetGPUDeviceFast(int devID);

    /* get the id of the current device */
    static
    int GetGPUDevice();

    /* reset cuda flags for more efficient cuda execution */
    static
    void SetFastFlags();

    /* reset cuda flags for more efficient cuda execution (all devices) */
    static
    void SetFastFlagsAllDevices();
};
/*
a class for the management of devices
*/
class XDevManager
{
public:
    /* CPU device information */
    XDevice CPUs[MAX_CPU_NUM];

    /* number of CPUs */
    int nCPU;

    /* GPU device information */
    XDevice GPUs[MAX_GPU_NUM];

    /* number of GPUs */
    int nGPU;

public:
    /* constructor */
    XDevManager();

    /* de-constructor */
    ~XDevManager();

    /* initialize it and get the CPU and GPU information */
    void Init();

    /* clear it */
    void Clear();

#ifdef USE_CUDA
    /* get the cublas handle of a given GPU */
    cublasHandle_t * GetCudaHandle(const int devID);
#endif

    /* get grid and block sizes that maximize potential (1-dimension assignment) */
    int GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize);

    /* get grid and block sizes that maximize potential (2-dimension assignment) */
    int GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize);

    /* get device ids for the given device information string */
    int GetDeviceIDs(char * devInfo, int * devIDs);

    /* show the id sequence */
    void ShowDeviceIDs(char * devInfo, char * msg);

    /* show device information */
    void ShowDevInfo();

    /* get the device information in string */
    char * GetDevString(int devID);
};
/* managing the devices */
extern XDevManager GDevs;
/* 
keep the device config: switch to the given device and remember the
previous one (pair with BacktoCudaDev below).
BUG FIX: the old macros ended with a dangling line-continuation
backslash after the final '}', which spliced the *next* source line
into the macro body; they are also wrapped in do-while(0) now so the
trailing semicolon at the call site parses correctly inside unbraced
if/else statements. */
#define ProtectCudaDev(devID, devIDBackup) \
do { \
    cudaGetDevice(&devIDBackup); \
    if(devIDBackup != devID) \
        cudaSetDevice(devID); \
} while(0)

/* restore the device remembered by ProtectCudaDev */
#define BacktoCudaDev(devID, devIDBackup) \
do { \
    if(devIDBackup != devID) \
        cudaSetDevice(devIDBackup); \
} while(0)
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-01-20
*
*/
#include <stdlib.h>
#include <stdio.h>
#include "XGlobal.h"
#if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h"
#include "time.h"
#include "iconv.h"
#else
#include "time.h"
#include "windows.h"
#include "process.h"
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* memory pool setting (sizes presumably in bytes - confirm against XMem) */
int MAX_MEM_BLOCK_NUM = 1024;                    /* maximum number of blocks in a pool */
int MAX_MEM_BLOCK_SIZE = 1024 * 1024 * 256;      /* maximum size of a block */
int MIN_MEM_BLOCK_SIZE = 1024 * 1024 * 64;       /* minimum size of a block */
int MINOR_MEM_BLOCK_SIZE = 1024 * 1024 * 256;    /* size of a minor block */
int MAX_MEM_BUF_SIZE = 1024 * 1024 * 256;        /* maximum size of the memory buffer */
int MIN_MEM_BUF_SIZE = 1024 * 1024 * 32;         /* minimum size of the memory buffer */
int TRAINING_SAMPLE_BUF_SIZE = 1024 * 1024 * 16; /* buffer size for training samples */
/* shared constants (used where an lvalue int/bool is needed) */
int CONST_MINUSONE = -1;
bool CONST_TRUE = true;
/* runtime switches */
int verboseLevel = 0;     /* controls how much the XPRINT* macros print */
bool useBLAS = false;     /* whether a CPU BLAS library is used */
bool useCUDA = false;     /* whether CUDA is used */
/* temporary log file handle (debugging) */
FILE * tmpLog = NULL;
/* ad-hoc timers used for profiling different phases */
double myTime = 0;
double myTime2 = 0;
double myTime3 = 0;
double myTime4 = 0;
double myTime5 = 0;
double myTime6 = 0;
double myTime7 = 0;
double myTime8 = 0;
double myTime9 = 0;
double myTimeForward1 = 0;
double myTimeForward2 = 0;
double myTimeForward3 = 0;
double myTimeBackward1 = 0;
double myTimeBackward2 = 0;
double myTimeBackward3 = 0;
double myTimeBackward4 = 0;
/* counter of dE/dW computations (debugging) */
int dEdWCount = 0;
/* temporary file handle (debugging) */
FILE * tF;
/* initialization of the global stuff: seed the C random number
   generator with the current wall-clock time */
void InitGlobalAll()
{
    time_t now = time(NULL);
    srand((unsigned int)now);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-01-20
*
*/
#ifndef __XGLOBAL_H__
#define __XGLOBAL_H__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#endif
// the CUDA stuff
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#define _XINLINE_ inline
//#define DOUBELPRICSION
#ifdef DOUBELPRICSION
#define DTYPE double
#define DTYPE_MIN (DTYPE)1.79E+308
#else
#define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38
#endif
#if WIN32
#define DELIMITER '\\'
#else
#define DELIMITER '/'
#endif
#ifndef MIN
#define MIN(a,b) ((a < b) ? a : b)
#endif
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#endif
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
/* 
check a condition; on failure report the failed expression, the source
location and a user message, then terminate.
The macros are wrapped in do-while(0) so each behaves as a single
statement, and the dangling line-continuation backslash after the final
'}' of the old versions (which spliced the following source line into
the macro body) has been removed. */
#define CheckNTErrors(x, msg) \
do { \
    if(!(x)) \
    { \
        fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); \
        exit(1); \
    } \
} while(0)

/* same as CheckNTErrors but without a user message.
   BUG FIX: the old format string ended with ": %s" but passed no
   matching argument - undefined behavior in fprintf */
#define CheckNTErrorsV0(x) \
do { \
    if(!(x)) \
    { \
        fprintf(stderr, "Error! calling '%s' (%s line %d)\n", #x, __FILENAME__, __LINE__); \
        exit(1); \
    } \
} while(0)

/* report an error with its source location and terminate */
#define ShowNTErrors(msg) \
do { \
    fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); \
    exit(1); \
} while(0)
#define MAX_FILE_NAME_LENGTH 1024 * 2
#define MAX_LINE_LENGTH 1024*1024
#define MAX_SENTENCE_LEN 512
#define X_MILLION 1000000
#define MAX_INT 2147483647
#define MAX_FLOAT FLT_MAX
#define FIELD_SEP " ||| "
#define FLOAT_MIN float(-1.0E38)
#define FLOAT16_MIN float(-65504)
#define MILLION 1000000
#define LOG_E_10 2.302585
#define LEADING_DIM 1
/* cuda setting */
#define MAX_CUDA_THREAD_NUM_PER_BLOCK 512
#define MIN_CUDA_SHARED_MEM_COL_SIZE 8
#define MAX_MODEL_NUM 512
#define SHARED_MEMORY_SIZE (48 << 10)
/* memory pool setting */
extern int MAX_MEM_BLOCK_NUM;
extern int MAX_MEM_BLOCK_SIZE;
extern int MIN_MEM_BLOCK_SIZE;
extern int MINOR_MEM_BLOCK_SIZE;
extern int MAX_MEM_BUF_SIZE;
extern int MIN_MEM_BUF_SIZE;
extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE;
extern bool CONST_TRUE;
//#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG
extern int verboseLevel;
extern bool useBLAS;
extern bool useCUDA;
#define FFLUSH(FILEH) \
{ \
fflush(FILEH); \
} \
#define XPRINT(VERBOSE,FILEH,STR) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR);FFLUSH(FILEH);}}
#define XPRINT1(VERBOSE,FILEH,STR,ARG) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG);FFLUSH(FILEH);}}
#define XPRINT2(VERBOSE,FILEH,STR,ARG,ARG2) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2);FFLUSH(FILEH);}}
#define XPRINT3(VERBOSE,FILEH,STR,ARG,ARG2,ARG3) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3);FFLUSH(FILEH);}}
#define XPRINT4(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4);FFLUSH(FILEH);}}
#define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
#define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
/* convert an integer-like value to bool (non-zero -> true).
   fix: parenthesize the argument and the whole expansion so that
   expressions such as B2I(a | b) or 1 + B2I(x) parse as intended */
#define B2I(V) ((V) == 0 ? false : true)
/* BLAS interfaces */
#ifdef DOUBELPRICSION
#define GEMM XBLAS_DGEMM
#else
#define GEMM XBLAS_SGEMM
#endif
extern void InitGlobalAll();
extern FILE * tmpLog;
extern int dEdWCount;
extern FILE * tF;
extern int tmpCountV2;
extern int nnnTotal;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* As it is, this is a heap.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-12-20
*
*/
#include "XGlobal.h"
#include "XHeap.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* 
constructor
>> mySize - maximum number of items the heap can keep
>> myMem - memory pool used for the allocation (plain new[] when NULL)
*/
template<HeapType hType, typename T>
XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
{
    mem = myMem;
    size = mySize;
    count = 0;
    if (mem == NULL)
        items = new HeapNode<T>[mySize];
    else
        /* fix: keep the returned buffer (it was previously discarded, leaving
           "items" uninitialized) and size it for HeapNode<T>, not T */
        items = (HeapNode<T>*)mem->Alloc(mem->devID, mySize * sizeof(HeapNode<T>));
}
/* deconstructor */
template<HeapType hType, typename T>
XHeap<hType, T>::~XHeap()
{
    /* fix: only release the array when we own it; pool-allocated storage
       belongs to the memory pool and must not be passed to delete[] */
    if (mem == NULL)
        delete[] items;
}
/* drop all items and refill every slot of the array with the given value */
template<HeapType hType, typename T>
void XHeap<hType, T>::Clear(T initValue)
{
    count = 0;
    for (int k = 0; k < size; k++) {
        items[k].value = initValue;
        items[k].index = 0;
    }
}
/* compare node i and node j: true when node i should sit above node j
   (smaller value for a min-heap, larger value for a max-heap) */
template<HeapType hType, typename T>
_XINLINE_ bool XHeap<hType, T>::Compare(int i, int j)
{
    return (hType == MIN_HEAP) ? (items[i].value < items[j].value)
                               : (items[j].value < items[i].value);
}
/* return a copy of the top-most item (the heap is left unchanged) */
template<HeapType hType, typename T>
_XINLINE_ HeapNode<T> XHeap<hType, T>::Top()
{
    return items[0];
}
/* return a copy of the last item in the underlying array */
template<HeapType hType, typename T>
_XINLINE_ HeapNode<T> XHeap<hType, T>::End()
{
    return items[count - 1];
}
/* push an item into the heap and restore the heap property
>> node - the item to add */
template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
{
    /* fix: re-enable the capacity check; pushing into a full heap wrote
       one element past the end of the array */
    CheckNTErrors((count < size), "Heap is full!");
    items[count] = node;
    Up(count);
    count++;
}
/* replace the top-most item and update the heap
   (cheaper than Pop + Push because only one sift-down is needed) */
template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
{
    items[0] = node;
    Down(0);
}
/* pop the top-most item: move the last item to the root, shrink the heap
   and sift the new root down
<< the removed (previous top) item */
template<HeapType hType, typename T>
_XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
{
    /* fix: re-enable the check, and test "count" (items in the heap),
       not "size" (capacity), as the old commented-out check did */
    CheckNTErrors((count > 0), "Empty heap!");
    HeapNode<T> node = items[0];
    items[0] = items[count - 1];
    count--;
    items[count].index = 0;
    items[count].value = 0;
    Down(0);
    return node;
}
/* sift the item at position k down the tree until the heap property holds */
template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::Down(int k)
{
    int i = k;
    for (;;) {
        int left = 2 * i + 1;
        if (left >= count)
            break;
        int right = left + 1;
        /* pick whichever child should be higher in the heap */
        int best = (right >= count || Compare(left, right)) ? left : right;
        if (Compare(i, best))
            break;
        HeapNode<T> swap = items[i];
        items[i] = items[best];
        items[best] = swap;
        i = best;
    }
}
/* sift the item at position k up the tree until its parent outranks it */
template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::Up(int k)
{
    for (int i = k; i > 0; ) {
        int p = (i - 1) / 2;
        if (Compare(p, i))
            break;
        HeapNode<T> swap = items[i];
        items[i] = items[p];
        items[p] = swap;
        i = p;
    }
}
/* explicit instantiation: the definitions live in this .cpp file, so every
   heap type used across the library must be instantiated here */
template class XHeap<MAX_HEAP, float>;
template class XHeap<MAX_HEAP, double>;
template class XHeap<MAX_HEAP, int>;
template class XHeap<MIN_HEAP, float>;
template class XHeap<MIN_HEAP, double>;
template class XHeap<MIN_HEAP, int>;
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* As it is, this is a heap.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-12-20
* Wedding anniversary !!!
*
*/
#ifndef __XHEAP_H__
#define __XHEAP_H__
#include "XMem.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
enum HeapType{MIN_HEAP, MAX_HEAP};
/* an item in the heap */
template <typename T>
struct HeapNode
{
/* node index */
int index;
/* value of the node */
T value;
HeapNode()
{
index = -1;
value = 0;
};
HeapNode(int i, T v)
{
index = i;
value = v;
};
};
/* a heap that keeps a data array of T (binary heap stored in a flat array) */
template<HeapType hType, typename T>
class XHeap
{
public:
    /* memory pool (plain new/delete is used when it is NULL) */
    XMem * mem;
    /* number of the items the heap keeps (capacity) */
    int size;
    /* number of the items that are already in the heap */
    int count;
    /* items */
    HeapNode<T> * items;
public:
    /* constructor */
    XHeap(int mySize, XMem * myMem = NULL);
    /* deconstructor */
    ~XHeap();
    /* clear the data */
    void Clear(T initValue);
    /* compare node i and node j */
    bool Compare(int i, int j);
    /* top most item (the heap must be non-empty) */
    HeapNode<T> Top();
    /* last item */
    HeapNode<T> End();
    /* push an item into the heap */
    void Push(HeapNode<T> node);
    /* replace the top-most item and update the heap */
    void ReplaceTop(HeapNode<T> node);
    /* pop the top most item */
    HeapNode<T> Pop();
    /* move item k down the tree */
    void Down(int k);
    /* move item k up the tree */
    void Up(int k);
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* Implementation of list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "XList.h"
#include "XGlobal.h"
#include "wchar.h"
#include "locale.h"
#if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h"
#include "time.h"
#include "iconv.h"
#else
#include "time.h"
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* default constructor: an empty list with no storage allocated yet */
XList::XList()
{
    items = NULL;
    count = 0;
    maxNum = 0;
    mem = NULL;
    isIntList = false;
}
/*
constructor
>> myMaxNum - maximum number of items to keep
>> isIntListOrNot - specify if the list keeps int items
*/
XList::XList(int myMaxNum, bool isIntListOrNot)
{
    mem = NULL;
    count = 0;
    maxNum = myMaxNum;
    isIntList = isIntListOrNot;
    items = new void*[myMaxNum];
}
/*
constructor
>> myMaxNum - maximum number of items to keep
>> myMem - the memory pool used for data allocation (plain new[] when NULL)
>> isIntListOrNot - specify if the list keeps int items
*/
XList::XList(int myMaxNum, XMem * myMem, bool isIntListOrNot)
{
    mem = myMem;
    maxNum = myMaxNum;
    count = 0;
    isIntList = isIntListOrNot;
    /* fix: fall back to new[] when no pool is given, as the other
       constructor and the destructor already expect */
    if (mem != NULL)
        items = (void**)mem->Alloc(mem->devID, sizeof(void*) * maxNum);
    else
        items = new void*[maxNum];
}
/* de-constructor: int items are owned by the list (see AddInt) and are
   released here; the item array itself is only freed when we own it */
XList::~XList()
{
    if (isIntList) {
        for (int k = 0; k < count; k++)
            delete[] (int*)items[k];
    }
    if (mem == NULL)
        delete[] items;
}
/*
allocate the data array for the list
>> myMaxNum - maximum number of items to keep
>> isIntListOrNot - specify if the list keeps int items
*/
void XList::Create(int myMaxNum, XMem * myMem)
{
mem = myMem;
maxNum = myMaxNum;
count = 0;
items = (void**)mem->Alloc(mem->devID, sizeof(void*) * maxNum);
}
/*
add an item into the list, growing the array (2n + 1) when it is full
>> item - pointer to the item
*/
void XList::Add(void * item)
{
    if (count == maxNum) {
        int newSize = maxNum * 2 + 1;
        void ** bigger;
        if (mem == NULL)
            bigger = new void*[newSize];
        else
            bigger = (void**)mem->Alloc(mem->devID, sizeof(void*) * newSize);
        memcpy(bigger, items, sizeof(void*) * maxNum);
        if (mem == NULL)
            delete[] items;
        items = bigger;
        maxNum = newSize;
    }
    items[count++] = item;
}
/*
add a number of items into the list
>> inputItems - pointer to the array of items
>> inputItemCount - number of input items
*/
void XList::Add(void ** inputItems, int inputItemCount)
{
    if (count + inputItemCount >= maxNum) {
        int newSize = (count + inputItemCount) * 2 + 1;
        void ** bigger;
        if (mem == NULL)
            bigger = new void*[newSize];
        else
            bigger = (void**)mem->Alloc(mem->devID, sizeof(void*) * newSize);
        memcpy(bigger, items, sizeof(void*) * maxNum);
        if (mem == NULL)
            delete[] items;
        items = bigger;
        maxNum = newSize;
    }
    memcpy(items + count, inputItems, sizeof(void*) * inputItemCount);
    count += inputItemCount;
}
/*
append a list to the current list (shallow copy: only the pointers are added)
>> l - the list we use to append
*/
void XList::AddList(XList * l)
{
    Add(l->items, l->count);
}
/*
add an integer-typed item into the list; the int is boxed on the heap and
owned by the list (released in the destructor / Clear)
>> i - the integer to add
*/
void XList::AddInt(int i)
{
    CheckNTErrors(isIntList, "An int list is required!");
    int * boxed = new int[1];
    boxed[0] = i;
    Add(boxed);
}
/*
insert an item at the given position of the list
>> pos - the position
>> item - the item for insertion
*/
void XList::Insert(int pos, void * item)
{
    if (count == maxNum) {
        int newSize = maxNum * 2 + 1;
        void ** bigger;
        if (mem == NULL)
            bigger = new void*[newSize];
        else
            bigger = (void**)mem->Alloc(mem->devID, sizeof(void*) * newSize);
        memcpy(bigger, items, sizeof(void*) * maxNum);
        if (mem == NULL)
            delete[] items;
        items = bigger;
        maxNum = newSize;
    }
    /* shift the tail right by one slot */
    for (int k = count; k > pos; k--)
        items[k] = items[k - 1];
    items[pos] = item;
    count++;
}
/* get the item at position i (NULL when i is out of range) */
void * XList::GetItem(int i)
{
    return (i >= 0 && i < count) ? items[i] : NULL;
}
/* get the integer-typed item at position i (0 when i is out of range) */
int XList::GetItemInt(int i)
{
    CheckNTErrors(isIntList, "An int list is required!");
    return (i >= 0 && i < count) ? *(int*)items[i] : 0;
}
/* set the item at position i (no-op when i is out of range) */
void XList::SetItem(int i, void * item)
{
    if (i < 0 || i >= count)
        return;
    items[i] = item;
}
/* set the integer-typed item at position i (no-op when i is out of range) */
void XList::SetItemInt(int i, int item)
{
    CheckNTErrors(isIntList, "An int list is required!");
    if (i < 0 || i >= count)
        return;
    *(int*)items[i] = item;
}
/*
find the position of the first matched item (pointer comparison)
>> item - the item for matching
<< the position where we hit the item, or -1 when it is absent
*/
int XList::FindFirst(void * item)
{
    int pos = 0;
    while (pos < count) {
        if (items[pos] == item)
            return pos;
        pos++;
    }
    return -1;
}
/* clear the list; for an int list the boxed integers are released too */
void XList::Clear()
{
    if (isIntList) {
        for (int k = 0; k < count; k++)
            delete[] (int*)items[k];
    }
    count = 0;
}
/* clear the list, releasing the char arrays kept in it (only when the
   strings were heap-allocated, i.e., no memory pool is in use) */
void XList::ClearStringList()
{
    if (mem == NULL) {
        for (int k = 0; k < count; k++)
            delete[] (char*)items[k];
    }
    count = 0;
}
/*
sort the list with the C standard-library qsort
>> itemSize - size (in bytes) of an item as seen by qsort; the array holds
   void* elements, so callers presumably pass sizeof(void*) here — confirm
>> comp - the comparison function used in sorting
*/
void XList::Sort(int itemSize, ListCompare comp)
{
    qsort(items, count, itemSize, comp);
}
/* reverse the list in place by swapping items from both ends */
void XList::Reverse()
{
    int lo = 0;
    int hi = count - 1;
    while (lo < hi) {
        void * tmp = items[lo];
        items[lo] = items[hi];
        items[hi] = tmp;
        lo++;
        hi--;
    }
}
/* remove the item at position i (no-op when i is out of range);
   the removed pointer is not freed — the caller owns it */
void XList::Remove(int i)
{
    if(i >= count || i < 0)
        return;
    /* fix: source and target regions overlap, so memmove is required;
       memcpy on overlapping ranges is undefined behavior */
    memmove(items + i, items + i + 1, sizeof(void*) * (count - i - 1));
    count--;
}
/*
copy the list (shallow: the item pointers are shared with the original)
>> myMem - memory pool used for allocating the data in the new list
<< the new list
*/
XList * XList::Copy(XMem * myMem)
{
    XList * dup = new XList(maxNum, myMem);
    for (int k = 0; k < count; k++)
        dup->Add(GetItem(k));
    return dup;
}
/*
shuffle the list with the Fisher-Yates algorithm
>> nround - number of rounds for shuffling
>> beg - where we start (the whole list is used when beg < 0)
>> len - how many items are used in shuffling
*/
void XList::Shuffle(int nround, int beg, int len)
{
    if(beg < 0){
        beg = 0;
        len = count;
    }
    if(beg + len > count)
        return;
    srand((unsigned int)time(NULL));
    for(int k = 0; k < nround; k++){
        /* Fisher-Yates shuffle: swap item i with a random item in [0, i] */
        for(int i = 0; i < len; i++){
            /* fix: the old float-based pick (rand()/RAND_MAX * (i+1)) could
               yield j == i + 1 when rand() == RAND_MAX, swapping with an
               item outside the [beg, beg+len) window */
            int j = rand() % (i + 1);
            void * t = items[beg + j];
            items[beg + j] = items[beg + i];
            items[beg + i] = t;
        }
    }
}
}
/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* Implementation of list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
* The first coding job this year!
*
*/
#ifndef __XLIST_H__
#define __XLIST_H__
#include "XMem.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* qsort-style comparison callback used by XList::Sort */
typedef int (* ListCompare)(const void * item1, const void * item2);
/* the XList class: a growable array of void* items, optionally backed by a
   memory pool; it can also own boxed int items (see isIntList) */
class XList
{
public:
    /* data items */
    void ** items;
    /* number of items */
    int count;
    /* maximum number of items can be kept */
    int maxNum;
    /* the memory pool for data array allocation (new/delete when NULL) */
    XMem * mem;
    /* indicates whether data items are integers (owned by the list) */
    bool isIntList;
public:
    /* constructor */
    XList();
    /* constructor */
    XList(int myMaxNum, bool isIntListOrNot = false);
    /* constructor */
    XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
    /* de-constructor */
    ~XList();
    /* utilities */
    void Create(int myMaxNum, XMem * myMem);
    void Add(void * item);
    void Add(void ** inputItems, int inputItemCount);
    void AddList(XList * l);
    void AddInt(int i);
    void Insert(int pos, void * item);
    void * GetItem(int i);
    int GetItemInt(int i);
    void SetItem(int i, void * item);
    void SetItemInt(int i, int item);
    int FindFirst(void * item);
    void Clear();
    void ClearStringList();
    void Sort(int itemSize, ListCompare comp);
    void Reverse();
    void Remove(int i);
    XList * Copy(XMem * myMem);
    void Shuffle(int nround = 10, int beg = -1, int len = 0);
    /* short */
    _XINLINE_ void * Get(int i) {return GetItem(i);};
    _XINLINE_ int GetInt(int i) {return GetItemInt(i);};
    _XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
    _XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
};
}
/* end of the nts (NiuTrans.Tensor) namespace */
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include "XPRunner.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
The XPRunner maintains the parallel processing resources, e.g., a pool
of threads. It can provide the parallel computation interface for someone
that needs to do something parallel, e.g., speed-up matrix operation by
multi-threading.
*/
XPRunner * globalPRunner = NULL;
/****************************
general methods
*/
/* constructor: start in single-job mode with no thread pool; the per-thread
   bookkeeping arrays are sized for MAX_THREAD_NUM up-front */
XPRunner::XPRunner()
{
    method = PRUNNER_SINGLE;
    /* multi-threading */
    threads = NULL;
    threadNum = 0;
    /* INT_MAX means "never worth parallelizing" until Init/CreateThreads runs */
    minimumOPNum = INT_MAX;
    MUTEX_INIT(mutex);
    isMultiThreaded = true;
    availableThreadNum = 0;
    runningThreadNum = 0;
    runningThreads = new int[MAX_THREAD_NUM];
    memset(runningThreads, 0 ,sizeof(int) * MAX_THREAD_NUM);
    runningStates = new int[MAX_THREAD_NUM];
    memset(runningStates, 0 ,sizeof(int) * MAX_THREAD_NUM);
    availableThreads = new int[MAX_THREAD_NUM];
    memset(availableThreads, 0 ,sizeof(int) * MAX_THREAD_NUM);
}
/* deconstructor: stop the workers, then release the bookkeeping arrays */
XPRunner::~XPRunner()
{
    KillThreads();
    MUTEX_DELE(mutex);
    delete[] runningThreads;
    delete[] runningStates;
    delete[] availableThreads;
}
/*
initialization
>> myThreadNum - number of required threads; no thread pool is created
   when it is not positive and the runner stays in single-job mode
*/
void XPRunner::Init(int myThreadNum)
{
    /* fix: only build the pool for a positive thread count; the old code
       called CreateThreads unconditionally, allocating a thread array (UB
       for a negative count) and sleeping 300ms even when no threads were
       requested */
    if(myThreadNum > 0){
        CreateThreads(myThreadNum);
        method = PRUNNER_MULTIPLE;
    }
}
/****************************
methods for multi-threading
*/
/*
create and start the worker threads
>> tNum - number of required threads (must not exceed MAX_THREAD_NUM)
*/
void XPRunner::CreateThreads(int tNum)
{
    if(tNum > MAX_THREAD_NUM){
        XPRINT2(0, stderr, "[XPRunner::CreateThreads] Error! Too many threads[%d>%d]!\n", tNum, MAX_THREAD_NUM);
        exit(1);
    }
    /* NOTE(review): any previously created pool is not released here —
       calling this twice would leak the old XThread array; confirm callers
       only invoke it once */
    threads = new XThread[tNum];
    for(int i = 0; i < tNum; i++){
        if(!threads[i].Start()){
            XPRINT1(0, stderr, "[XPRunner::CreateThreads] Error! cannot create thread %d\n", i);
            exit(1);
        }
    }
    /* give the threads a moment (300ms) to spin up before jobs arrive */
#ifdef _WIN32
    Sleep(300);
#else
    usleep(300 * 1000);
#endif
    threadNum = tNum;
    minimumOPNum = MIN_OPERATION_NUM;
}
/* kill all threads and release the pool
   (the per-thread End() calls are currently disabled — presumably the
   threads are detached or end on process exit; TODO confirm) */
void XPRunner::KillThreads()
{
    for(int i = 0; i < threadNum; i++){
        //threads[i].End();
    }
#ifdef _WIN32
    //Sleep(300);
#else
    //sleep(0.3);
#endif
    delete[] threads;
    threads = NULL;
}
/*
run a set of jobs in parallel: poll the workers, hand each free worker the
next job, and loop until every job has finished
>> jobFunctions - the function for each job
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each polling round
NOTE(review): jobs are indexed via jobArgs->count - c, which assumes
jobFunctions and jobArgs have the same count — confirm at the call sites
*/
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
{
    if(threadNum <= 0){
        XPRINT(1, stderr, "Error! No threads were created!\n");
        exit(1);
    }
    runningThreadNum = 0;
    availableThreadNum = 0;
    /* worker state: 0 = never used, 1 = marked available, 2 = running a job */
    memset(runningStates, 0, sizeof(int) * MAX_THREAD_NUM);
    int c = jobFunctions->count;
    int unfinished = c;
    MUTEX_LOCK(mutex);
    while(unfinished > 0){
        /* get the list of threads that are ready to process the job */
        for(int i = 0; i < threadNum; i++){
            if(runningStates[i] == 2 && threads[i].jobCount == 0){
                /* a job has been finished*/
                unfinished--;
                availableThreads[availableThreadNum++] = i;
                runningStates[i] = 1;
#ifdef _WIN32
                MUTEX_LOCK(threads[i].workingMutex);
                COND_RESET(threads[i].jobCond);
                MUTEX_UNLOCK(threads[i].workingMutex);
#endif
            }
            else if(runningStates[i] == 0 && threads[i].jobCount == 0){
                /* an idle worker that has not been given any job yet */
                availableThreads[availableThreadNum++] = i;
                runningStates[i] = 1;
#ifdef _WIN32
                MUTEX_LOCK(threads[i].workingMutex);
                COND_RESET(threads[i].jobCond);
                MUTEX_UNLOCK(threads[i].workingMutex);
#endif
            }
        }
        /* assign the jobs */
        for(int i = availableThreadNum - 1; i >= 0 && c > 0; i--){
            /* the function to run*/
            TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
            /* the arguments that are passed to the function */
            volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
            /* thread */
            XThread * thread = threads + availableThreads[i];
            thread->argv = args;
            thread->function = function;
            MUTEX_LOCK(thread->workingMutex);
            thread->working = 1;
            MUTEX_UNLOCK(thread->workingMutex);
#ifdef USE_PTHREAD
            MUTEX_LOCK(thread->mutex);
            thread->jobCount++;
            MUTEX_UNLOCK(thread->mutex);
            //COND_BROADCAST(thread->cond);
            COND_SIGNAL(thread->cond);
            //MUTEX_UNLOCK(thread->mutex);
#else
#ifdef _WIN32
            /* reset various locks */
            MUTEX_LOCK(thread->workingMutex);
            thread->jobCount++;
            COND_RESET(thread->jobCond);
            //COND_RESET(thread->gCond);
            MUTEX_UNLOCK(thread->workingMutex);
            /* inform the job */
            //ResumeThread(threads[i].hnd);
            COND_SIGNAL(thread->jobCond);
#endif
#endif
            /* a job is under processing */
            c--;
            availableThreadNum--;
            runningStates[availableThreads[i]] = 2;
        }
        if(sleepTime > 0){
#ifdef _WIN32
            Sleep((DWORD)sleepTime);
#else
            sleep(sleepTime/1000);
#endif
        }
    }
    MUTEX_UNLOCK(mutex);
}
/*
get the number of parallel jobs to run for a problem of the given size,
capped by the number of worker threads
>> size - number of atomic operations we need
*/
int XPRunner::GetJobNum(int size)
{
    int wanted = int((float)size / minimumOPNum);
    return MIN(wanted, threadNum);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XPRUNNER_H__
#define __XPRUNNER_H__
#include "XThread.h"
#include "XList.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MIN_OPERATION_NUM 1024 * 4
#define MAX_JOB_NUM 32
#define MAX_THREAD_NUM 32
#define PRUNNER_SINGLE 0
#define PRUNNER_MULTIPLE 1
#define PRUNNER_GPU 2
/*
The XPRunner maintains the parallel processing resources, e.g., a pool
of threads. It can provide the parallel computation interface for someone
that needs to do something parallel, e.g., speed-up matrix operation by
multi-threading.
*/
class XPRunner
{
public:
    /*
    method of parallelization
    // 0: single job; 1: multi-threading; 2: gpu
    */
    int method;
public:
    /* a set of threads */
    XThread * threads;
    /* max number of threads */
    int threadNum;
    /* a mutex lock (serializes Run calls) */
    MUTEX_HANDLE mutex;
    /*
    Minimum number of atomic operations for a thread.
    It is used to avoid large overhead of too many "tiny" jobs.
    */
    int minimumOPNum;
    /* if multi-threading is activated */
    bool isMultiThreaded;
    /* list of running threads */
    int * runningThreads;
    /* list of threads states (0: idle, 1: available, 2: running; see Run) */
    int * runningStates;
    /* number of running threads */
    int runningThreadNum;
    /* list of available threads */
    int * availableThreads;
    /* number of available threads */
    int availableThreadNum;
    /* general methods */
public:
    /* constructor */
    XPRunner();
    /* deconstructor */
    ~XPRunner();
    /* initialization */
    void Init(int myThreadNum);
    /* methods for multi-threading */
public:
    /* initialization */
    void CreateThreads(int tNum);
    /* kill all running threads in the pool */
    void KillThreads();
    /* run a set of jobs in parallel */
    void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
    /* get the number of parallel jobs to run */
    int GetJobNum(int size);
};
extern XPRunner * globalPRunner;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is an implementation of queue. Actually we intend to use it to maintain
* a priority job list
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-04-05
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "XQueue.h"
#include "XDevice.h"
#include "XList.h"
#include "XUtility.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/**************************************
job item used in queues
*/
/* constructor: an empty job with a fresh (owned) argument list */
JobQueueNode::JobQueueNode()
{
    args = new XList(1);
    job = NULL;
}
/* de-constructor; note that the job function pointer is not owned here */
JobQueueNode::~JobQueueNode()
{
    delete args;
}
/**************************************
This class provides standard utilities of Queue.
*/
/* constructor
>> mySize - capacity of the circular buffer that backs the queue */
XQueue::XQueue(int mySize)
{
    queue = new void*[mySize];
    memset(queue, 0, sizeof(void*) * mySize);
    size = mySize;
    itemCount = 0;
    /* head/tail indices of the circular buffer */
    head = 0;
    tail = 0;
    isJobQueue = false;
    jobDequeuerArgs = new XList(1);
    jobDequeuerBreak = false;
    runningJobCount = 0;
    jobStream = NULL;
    jobStream1 = NULL;
    jobStream2 = NULL;
    MUTEX_INIT(enqueueMutex);
    MUTEX_INIT(dequeueMutex);
    COND_INIT(queueCond);
    MUTEX_INIT(jobQueueMutex);
}
/* deconstructor: releases the buffer, the dequeuer arguments and the job
   streams; stopping the consumer thread is currently disabled here —
   presumably callers are expected to call StopJobConsumer themselves */
XQueue::~XQueue()
{
    delete[] queue;
    delete jobDequeuerArgs;
    delete jobStream;
    delete jobStream1;
    delete jobStream2;
    //if(isJobQueue)
    //    StopJobConsumer();
    MUTEX_DELE(enqueueMutex);
    MUTEX_DELE(dequeueMutex);
    COND_DELE(queueCond);
    MUTEX_DELE(jobQueueMutex);
}
/*
put an item in the tail of the queue
>> item - the item we intend to add into the queue
(both mutexes are taken — enqueue first, then dequeue — and a waiting
consumer is woken via queueCond)
*/
void XQueue::Enqueue(void * item)
{
    MUTEX_LOCK(enqueueMutex);
    MUTEX_LOCK(dequeueMutex);
    CheckNTErrors((itemCount < size), "Put too many items into the queue!");
    queue[tail] = item;
    tail = (tail + 1) % size;
    itemCount++;
    COND_SIGNAL(queueCond);
    MUTEX_UNLOCK(dequeueMutex);
    MUTEX_UNLOCK(enqueueMutex);
}
/*
fetch an item from head of the queue, blocking while the queue is empty
<< return - the head item of the queue
*/
void * XQueue::Dequeue()
{
    MUTEX_LOCK(dequeueMutex);
    while(itemCount == 0)
    {
        /* on Windows the condition primitive does not release the mutex
           itself, so it is unlocked/relocked manually around the wait */
#ifdef WIN32
        MUTEX_UNLOCK(dequeueMutex);
#endif
        COND_WAIT(queueCond, dequeueMutex);
#ifdef WIN32
        MUTEX_LOCK(dequeueMutex);
#endif
    }
    void * r = queue[head];
    head = (head + 1) % size;
    itemCount--;
    MUTEX_UNLOCK(dequeueMutex);
    return r;
}
/* return if the queue is empty (unsynchronized read: the answer may be
   stale under concurrent enqueue/dequeue) */
bool XQueue::IsEmpty()
{
    return itemCount == 0;
}
/* wait until the queue is empty */
void XQueue::WaitForEmptyJobQueue()
{
while(runningJobCount > 0){
XSleep(10);
}
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "None fineished jobs remain");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "None fineished jobs remain");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "None fineished jobs remain");
jobStream2->Clear();
}
}
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
int cpuid = -1;
/*
run job consumer (in another thread)
>> jobDevID - id of the device for running the jobs (a non-negative id
   selects an entry of the global devids table; a negative id means CPU)
*/
void XQueue::RunJobConsumer(int jobDevID)
{
    CheckNTErrors((jobDevID < 16), "device id is out of scope!");
    isJobQueue = true;
    jobDequeuerArgs->Clear();
    /* args = { this queue, pointer to the device id } */
    jobDequeuerArgs->Add(this);
    jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
    jobDequeuer.function = (TFunction)DequeueJobs;
    jobDequeuer.argv = jobDequeuerArgs;
    jobDequeuer.Start();
    jobDequeuer.LetItGo();
}
/* stop the job consumer: set the break flag, then enqueue a dummy (NULL)
   job so a consumer blocked in Dequeue wakes up and sees the flag */
void XQueue::StopJobConsumer()
{
    jobDequeuerBreak = true;
    XSleep(10);
    EnqueueJob(NULL, NULL);
    jobDequeuer.End();
    isJobQueue = false;
}
/* add a job item to process
>> job - the job function (cast to TFunction by the consumer)
>> jobArgs - arguments of the job (copied into the node; may be NULL) */
void XQueue::EnqueueJob(void * job, XList * jobArgs)
{
    MUTEX_LOCK(jobQueueMutex);
    runningJobCount++;
    MUTEX_UNLOCK(jobQueueMutex);
    JobQueueNode * node = new JobQueueNode();
    node->job = job;
    if(jobArgs != NULL)
        node->args->AddList(jobArgs);
    Enqueue(node);
}
/* job item consumer: loops on the queue, running each dequeued job on the
   given device until the break flag is raised
>> args - a two-item list: (XQueue*, int* devID) */
void XQueue::DequeueJobs(XList * args)
{
    CheckNTErrors((args->count == 2), "Illegal arguments!");
    XQueue * q = (XQueue*)args->GetItem(0);
    int devID = *(int*)args->GetItem(1);
    int devIDBackup = XDevice::GetGPUDevice();
    if(devID >= 0)
        XDevice::SetGPUDevice(devID);
    while(1){
        JobQueueNode * node = (JobQueueNode*)q->Dequeue();
        if(q->GetJobBreak()){
            /* fix: release the node (typically the NULL sentinel enqueued
               by StopJobConsumer), which was previously leaked */
            delete node;
            break;
        }
        CheckNTErrors((node != NULL), "Illegal job!");
        /* process a job */
        ((TFunction)node->job)(node->args);
        delete node;
        MUTEX_LOCK(q->jobQueueMutex);
        q->runningJobCount--;
        MUTEX_UNLOCK(q->jobQueueMutex);
    }
    if(devID >= 0)
        XDevice::SetGPUDevice(devIDBackup);
}
/* get the break flag (set by StopJobConsumer to end the consumer loop) */
bool XQueue::GetJobBreak()
{
    return jobDequeuerBreak;
}
/* get job stream by id
>> n - stream id (0, 1 or 2); any other id is a fatal error */
XStream * XQueue::GetJobStream(int n)
{
    switch(n){
        case 0:
            return jobStream;
        case 1:
            return jobStream1;
        case 2:
            return jobStream2;
        default:
            ShowNTErrors("invalid stream id!");
    }
    return NULL;
}
/* make job streams, one per given device id
   (a stream is skipped when its id is INVALID_DEVICE_ID) */
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
{
    if(devID != INVALID_DEVICE_ID)
        jobStream = new XStream(0, devID);
    if(devID1 != INVALID_DEVICE_ID)
        jobStream1 = new XStream(0, devID1);
    if(devID2 != INVALID_DEVICE_ID)
        jobStream2 = new XStream(0, devID2);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is an implementation of queue. Actually we intend to use it to maintain
* a priority job list
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-04-05
* I came back from the holiday - while Tongran and Dingdang are still in Beijing
* (working and playing??)
*
* Parts of the code is copied from Duquan's work. Thanks :)
*/
#ifndef __XQUEUE_H__
#define __XQUEUE_H__
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_QUEUE_SIZE 1024 * 8
/*
job item used in queues: a job function bundled with its argument list
*/
class JobQueueNode
{
public:
    /* the job function (cast to TFunction before being called) */
    void * job;
    /* arguments of the job (owned by the node) */
    XList * args;
public:
    /* constructor */
    JobQueueNode();
    /* de-constructor */
    ~JobQueueNode();
};
/*
This class provides standard utilities of Queue: a fixed-capacity,
mutex-protected circular buffer, plus an optional consumer thread that
treats the queued items as jobs.
*/
class XQueue
{
private:
    /* mutex for the enqueue process */
    MUTEX_HANDLE enqueueMutex;
    /* mutex for the dequeue process */
    MUTEX_HANDLE dequeueMutex;
    /* conditional mutex for the dequeue process */
    COND_HANDLE queueCond;
    /* mutex for the job queue */
    MUTEX_HANDLE jobQueueMutex;
    /* the array for the queue (used as a circular buffer) */
    void ** queue;
    /* max size of the queue */
    int size;
    /* number of item in queue */
    int itemCount;
    /* head of the queue */
    int head;
    /* tail of the queue */
    int tail;
    /* indicates whether we are using a job queue */
    bool isJobQueue;
    /* consume the job items in the queue */
    XThread jobDequeuer;
    /* argument list of jobDequeuer */
    XList * jobDequeuerArgs;
    /* indicates whether jobDequeuer stops */
    bool jobDequeuerBreak;
    /* running job count */
    int runningJobCount;
    /* job streams (we think that three streams is enough :)) */
    XStream * jobStream;
    XStream * jobStream1;
    XStream * jobStream2;
public:
    /* constuctor */
    XQueue(int mySize = MAX_QUEUE_SIZE);
    /* deconstructor */
    ~XQueue();
    /* put an item in the tail of the queue */
    void Enqueue(void * item);
    /* fetch an item from head of the queue (blocks while empty) */
    void * Dequeue();
    /* return if the queue is empty */
    bool IsEmpty();
    /* wait until the queue is empty */
    void WaitForEmptyJobQueue();
    /* run the job consumer */
    void RunJobConsumer(int jobDevID = 0);
    /* stop the job consumer */
    void StopJobConsumer();
    /* add a job item to process */
    void EnqueueJob(void * job, XList * jobArgs);
    /* job item consumer */
    static
    void DequeueJobs(XList * args);
    /* get the break flag */
    bool GetJobBreak();
    /* get job stream */
    XStream * GetJobStream(int n = 0);
    /* make job streams */
    void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
/*
constructor
>> priority - stream priority (not used by Create for now)
>> myDevID - device id; negative values mean CPU, so no cuda stream is made
>> myMaxEventNum - max number of cuda events this stream can hand out
*/
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
    devID = myDevID;
#ifdef USE_CUDA
    if(myDevID >= 0){
        /* allocate the event array while the target device is current */
        int backupDevID = XDevice::GetGPUDevice();
        XDevice::SetGPUDevice(myDevID);
        events = new cudaEvent_t[myMaxEventNum];
        XDevice::SetGPUDevice(backupDevID);
        maxEventNum = myMaxEventNum;
        usedEventNum = 0;
    }
    else{
        /* fix: initialize the pointer so that the destructor's delete[] is
           well-defined. It was previously left uninitialized for CPU-only
           streams, making ~XStream() delete a garbage pointer. */
        events = NULL;
        maxEventNum = 0;
        usedEventNum = 0;
    }
#endif
    Create(priority, devID);
}
/* de-constructor */
XStream::~XStream()
{
    Destroy();
#ifdef USE_CUDA
    /* NOTE(review): events is only assigned in the constructor's USE_CUDA
       branch when myDevID >= 0; confirm it is initialized (e.g. to NULL)
       on every path, otherwise this delete[] reads an uninitialized pointer */
    delete[] events;
#endif
}
/* create the underlying cuda stream on the given device
>> priority - stream priority (unused for now, see the note below)
>> myDevID - device id; negative values mean CPU and nothing is created */
void XStream::Create(int priority, int myDevID)
{
    if(myDevID < 0)
        return;

#ifdef USE_CUDA
    /* switch to the target device, build the stream there, then switch back */
    int devIDBackup = XDevice::GetGPUDevice();
    XDevice::SetGPUDevice(myDevID);

    /* priority is not used for now:
       cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority); */
    cudaError_t result = cudaStreamCreate(&stream);
    CheckNTErrors((result == cudaSuccess),
                  "cannot create the cuda stream!");

    XDevice::SetGPUDevice(devIDBackup);
#endif

    devID = myDevID;
}
/* destroy the stream and the events recorded on it */
void XStream::Destroy()
{
    /* CPU-only streams own no cuda resources */
    if(devID < 0)
        return;
#ifdef USE_CUDA
    int backupDevID = XDevice::GetGPUDevice();
    XDevice::SetGPUDevice(devID);
    cudaStreamDestroy(stream);
    XDevice::SetGPUDevice(backupDevID);
    /* release the cuda events held by this stream as well.
       NOTE(review): devID is not reset afterwards, so a second call would
       re-run the cuda teardown - confirm Destroy() is only called once
       (from the destructor) */
    Clear();
#endif
}
/* destroy all events that have been handed out by this stream so far */
void XStream::Clear()
{
#ifdef USE_CUDA
    int devIDBackup = XDevice::GetGPUDevice();
    XDevice::SetGPUDevice(devID);

    for(int e = 0; e < usedEventNum; e++)
        cudaEventDestroy(events[e]);
    usedEventNum = 0;

    XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* judge if all the jobs in the stream have been finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
    /* cudaStreamQuery returns cudaSuccess iff all queued work is done */
    return cudaStreamQuery(stream) == cudaSuccess;
#else
    /* without CUDA there is nothing pending */
    return true;
#endif
}
/* block the calling host thread until every job in this stream finishes */
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
    int devIDBackup = XDevice::GetGPUDevice();
    bool needSwitch = (devID != devIDBackup);

    /* only switch devices when the stream lives on another GPU */
    if(needSwitch)
        XDevice::SetGPUDevice(devID);

    cudaStreamSynchronize(stream);

    if(needSwitch)
        XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* block the calling host thread until the current device has finished all
   of its preceding jobs (in all streams) */
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
    /* fix: cudaThreadSynchronize() has been deprecated since CUDA 4.0;
       cudaDeviceSynchronize() is its direct replacement with identical
       semantics */
    cudaDeviceSynchronize();
#endif
}
/* block the calling host thread until the given device has finished all its jobs
>> devID - id of the device to synchronize with */
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
    /* fix: the backup id was queried twice (XDevice::GetGPUDevice() and then
       cudaGetDevice() into the same variable); one query is enough */
    int devIDBackup = XDevice::GetGPUDevice();
    if(devID != devIDBackup)
        XDevice::SetGPUDevice(devID);
    cudaDeviceSynchronize();
    if(devID != devIDBackup)
        XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream
>> precedingStream - the stream whose already-queued jobs must finish first */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
    /* record an event at the current tail of the preceding stream ... */
    cudaEvent_t * e = precedingStream->MakeEvent();
    cudaEventRecord(*e, precedingStream->stream);
    /* ... and make this stream wait on it (0 = default flags) */
    cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
/* return a pointer to the underlying cuda stream handle.
   NOTE(review): defining this as `inline` in the .cpp file makes it
   unusable from other translation units - confirm all callers live here */
inline cudaStream_t * XStream::Get()
{
    return &stream;
}
/* make an event on this stream's device.
   Events are handed out from the fixed-size pool allocated in the
   constructor; they are only destroyed in Clear(). */
inline cudaEvent_t * XStream::MakeEvent()
{
    int backupDevID = XDevice::GetGPUDevice();
    XDevice::SetGPUDevice(devID);
    /* the pool cannot grow - fail loudly when it is exhausted */
    CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
    cudaEvent_t * e = events + usedEventNum++;
    cudaEventCreate(e);
    XDevice::SetGPUDevice(backupDevID);
    return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
class XStream
{
public:
#ifdef USE_CUDA
    /* the cuda stream */
    cudaStream_t stream;

    /* list of cuda events for synchronizing different streams
       (allocated in the constructor, handed out by MakeEvent) */
    cudaEvent_t * events;

    /* max number of the events */
    int maxEventNum;

    /* number of used events */
    int usedEventNum;
#else
    /* placeholder so that the class layout exists without CUDA */
    void * stream;
#endif

    /* device that holds the stream (negative values mean CPU) */
    int devID;

public:
    /* constructor */
    XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);

    /* de-constructor */
    ~XStream();

    /* create the stream */
    void Create(int priority = 0, int devID = 0);

    /* destroy the stream */
    void Destroy();

    /* destroy the events recorded on the stream */
    void Clear();

    /* judge if all the jobs in the stream have been finished */
    bool IsFinished();

    /* stream synchronize: wait for all jobs in this stream */
    void StreamSynchronize();

    /* thread synchronize: wait for all jobs on the current device */
    static
    void ThreadSynchronize();

    /* device synchronize: wait for all jobs on a given device */
    static
    void DeviceSynchronize(int devID);

    /* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
    void MakeDependency(XStream * precedingStream);

#ifdef USE_CUDA
    /* get the stream */
    cudaStream_t * Get();

    /* make an event */
    cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* a naive implementation of thread pool (actually it is a pool)
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-08
*
*/
#include "XGlobal.h"
#include "XThread.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/* constructor: put the thread object into a clean, not-yet-started state */
XThread::XThread()
{
    /* no job, no pending work, not running */
    function = NULL;
    argv = NULL;
    toBreak = false;
    jobCount = 0;
    working = 0;
    isRunning = false;
    hnd = 0;

    /* synchronization primitives */
#ifdef USE_PTHREAD
    MUTEX_INIT(mutex);
    COND_INIT(cond);
#endif
    MUTEX_INIT(gMutex);
    MUTEX_INIT(workingMutex);
    COND_INIT(jobCond);
}
/* de-constructor: stop the worker (if any) and release the primitives */
XThread::~XThread()
{
    End();

#ifdef USE_PTHREAD
    MUTEX_DELE(mutex);
    COND_DELE(cond);
#endif
    MUTEX_DELE(gMutex);
    MUTEX_DELE(workingMutex);
    COND_DELE(jobCond);
}
/* a wrapper for the start-routine parameter in pthread_create */
void * XThread::Wrapper(void * ptr)
{
XThread * p = (XThread *)ptr;
p->Run();
return 0;
}
/*
Running loop of this thread. It is a very naive implementation:
we loop and wait for a signal that activates the job processing;
after the job is done, we wait again if there is no new job.
*/
void XThread::Run()
{
#ifdef _WIN32
    //COND_RESET(gCond);
#endif
    while(1){
#ifdef USE_PTHREAD
        /* wait under the mutex until a job has been posted (jobCount > 0) */
        MUTEX_LOCK(mutex);
        while(jobCount == 0){
            COND_WAIT(cond, mutex); // it unlocks the mutex first
                                    // and then waits for a signal
        }
#else
#ifdef _WIN32
        //SuspendThread(hnd);
        /* windows: block on the job event; note that COND_WAIT maps to
           WaitForSingleObject and the mutex argument is ignored */
        COND_WAIT(jobCond, gMutex);
#endif
#endif
        /* End() wakes us up with toBreak set to terminate the loop */
        if(toBreak){
#ifdef USE_PTHREAD
            MUTEX_UNLOCK(mutex);
#endif
            break;
        }

        /* run the posted job with its argument list */
        function(argv);

#ifdef USE_PTHREAD
        jobCount--;
        MUTEX_UNLOCK(mutex);
#else
#ifdef _WIN32
        /* mark the job as done */
        MUTEX_LOCK(workingMutex);
        working = 0;
        jobCount--;
        MUTEX_UNLOCK(workingMutex);
#endif
#endif
    }
}
/* create and run the thread
<< returns true if the thread has been created successfully */
bool XThread::Start()
{
    toBreak = false;

#ifdef USE_PTHREAD
    int r = pthread_create(&hnd, NULL, &Wrapper, static_cast<void *>(this));
    if(r != 0)
        return false;
#else
#ifdef _WIN32
    DWORD id;
    hnd = BEGINTHREAD(0, 0, &Wrapper, this, 0, &id);
    if(hnd == 0)
        return false;
#else
    /* no threading backend available: run the job loop synchronously */
    Run();
#endif
#endif

    /* fix: mark the thread as running only after a successful creation.
       previously the flag was set up-front, so a failed pthread_create /
       _beginthreadex left the object claiming to run, which made End()
       wait on jobCount and join a thread that never existed */
    isRunning = true;
    return true;
}
/* end the thread: ask the loop in Run() to break, wait until the pending
   jobs have drained, wake the worker up so it observes toBreak, then join */
void XThread::End()
{
    toBreak = true;

    if(isRunning == false)
        return;

    /* busy-wait (with sleeps) until all posted jobs have been consumed */
    while(jobCount > 0){
#ifdef _WIN32
        Sleep(200);
#else
        usleep(200 * 1000);
#endif
    };

#ifdef USE_PTHREAD
    /* post a dummy "job" so that the wait loop in Run() wakes up and
       sees toBreak.
       NOTE(review): jobCount is modified here without holding `mutex`
       (the locked version is commented out) - confirm the wakeup cannot
       be missed by the waiter */
    //MUTEX_LOCK(mutex);
    jobCount++;
    //COND_BROADCAST(cond);
    //COND_SIGNAL(cond);
    //MUTEX_UNLOCK(mutex);
    COND_BROADCAST(cond);
#else
    COND_SIGNAL(jobCond);
#endif

    Join();
    isRunning = false;
}
/* wait for thread termination */
void XThread::Join()
{
#ifdef USE_PTHREAD
    pthread_join(hnd, 0);
#else
#ifdef _WIN32
    WaitForSingleObject(hnd, INFINITE);
    /* NOTE(review): the handle is closed right after the wait; make sure
       Join() is never called twice for the same handle */
    CloseHandle(hnd); // are you sure if you want to do this?
#endif
#endif
}
/* let the thread process a job: bump the pending-job counter and wake
   the worker up */
void XThread::LetItGo()
{
#ifdef USE_PTHREAD
    MUTEX_LOCK(mutex);
    jobCount++;
    /* fix: signal the condition variable so that a worker blocked in
       COND_WAIT (see the jobCount == 0 loop in Run()) actually wakes up;
       previously the counter was incremented without any notification,
       leaving the worker asleep until a spurious wakeup */
    COND_SIGNAL(cond);
    MUTEX_UNLOCK(mutex);
#else
#ifdef _WIN32
    /* reset various locks */
    MUTEX_LOCK(workingMutex);
    jobCount++;
    COND_RESET(jobCond);
    MUTEX_UNLOCK(workingMutex);

    /* inform the job */
    COND_SIGNAL(jobCond);
#endif
#endif
}
/* wait for a signal on condition c, using mutex m */
void XThread::Wait(COND_HANDLE * c, MUTEX_HANDLE * m)
{
#ifdef USE_PTHREAD
    MUTEX_LOCK(*m);
    COND_WAIT(*c, *m);
    MUTEX_UNLOCK(*m);
#else
#ifdef _WIN32
    /* on windows COND_WAIT maps to WaitForSingleObject and the mutex
       argument is not used */
    COND_WAIT(*c, *m);
#endif
#endif
}
/***********************************************
a counter with mutex (thread-safe counter)
*/

/* constructor: the count starts at zero */
XCounter::XCounter()
{
    count = 0;
    MUTEX_INIT(mutex);
}
/* de-constructor: release the lock */
XCounter::~XCounter()
{
    MUTEX_DELE(mutex);
}
/* add the counter by 1 (thread-safe) */
void XCounter::Add()
{
    MUTEX_LOCK(mutex);
    count++;
    MUTEX_UNLOCK(mutex);
}
/* get the counting number (a thread-safe snapshot of the count) */
int XCounter::Get()
{
    MUTEX_LOCK(mutex);
    int snapshot = count;
    MUTEX_UNLOCK(mutex);

    return snapshot;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* a naive implementation of thread pool (actually it is a pool)
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-08
*
*/
#ifndef __XTHREAD_H__
#define __XTHREAD_H__
#include "XList.h"
#ifndef _WIN32
#define USE_PTHREAD // for linux
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
//////////////////////////////////////////////////
// necessary libs
#ifdef USE_PTHREAD
#include <pthread.h> // use "-lpthread" when compiling on linux systems
#else
#ifdef _WIN32
#include <windows.h>
#include <process.h>
#endif
#endif
#if(defined(_WIN32) && !defined (__CYGWIN__))
#define CRFPP_USE_THREAD 1
#define BEGINTHREAD(src, stack, func, arg, flag, id) \
(HANDLE)_beginthreadex((void *)(src), (unsigned)(stack), \
(unsigned(_stdcall *)(void *))(func), (void *)(arg), \
(unsigned)(flag), (unsigned *)(id))
#endif
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile XList*);
/*
This is a class that wraps the standard implementation of threading
(for both windows and linux OS)
*/
class XThread
{
public:
    /* thread id (handle) */
    THREAD_HANDLE hnd;

    /* mutex used when informing the outside caller
       (also passed to COND_WAIT on windows, where it is ignored) */
    MUTEX_HANDLE gMutex;

    /* working state */
    int working;

    /* a lock to protect the working state */
    MUTEX_HANDLE workingMutex;

    /* to inform the worker when a job is ready */
    COND_HANDLE jobCond;

    /* indicate whether the thread is running */
    bool isRunning;

#ifdef USE_PTHREAD
    /* a mutex lock that guards jobCount and the job handoff */
    MUTEX_HANDLE mutex;

    /* condition variable the worker waits on */
    COND_HANDLE cond;

    /* scheduling parameters for the thread (not used in this view) */
    sched_param schedParam;
#else
#endif

public:
    /* function to run (the job) */
    volatile
    TFunction function;

    /* arguments (for the function to run) */
    volatile
    XList * argv;

    /* a flag that asks the running loop to break */
    volatile
    bool toBreak;

    /* number of jobs that are waiting */
    volatile
    int jobCount;

public:
    /* constructor */
    XThread();

    /* de-constructor */
    ~XThread();

public:
    /* a wrapper for the start-routine parameter in pthread_create */
    static void * Wrapper(void * ptr);

    /*
    Core of the thread. It is a very naive implementation:
    we loop and wait for a signal to activate the job processing;
    after that, we wait again if there is no new job.
    */
    void Run();

    /* create and run the thread */
    bool Start();

    /* end the thread */
    void End();

    /* wait for thread termination */
    void Join();

    /* let the thread process a job */
    void LetItGo();

    /* wait for a signal */
    static
    void Wait(COND_HANDLE * c, MUTEX_HANDLE * m);
};
/*
a counter with mutex (thread-safe counter)
*/
class XCounter
{
private:
    /* the current count */
    int count;

    /* lock that protects the count */
    MUTEX_HANDLE mutex;

public:
    /* constructor */
    XCounter();

    /* de-constructor */
    ~XCounter();

    /* add the counter by 1 */
    void Add();

    /* get the counting number */
    int Get();
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* some public functions are defined here
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-27
*
*/
#include <stdio.h>
#include "XGlobal.h"
#ifndef __XUTILITY_H__
#define __XUTILITY_H__
namespace nts{ // namespace nts(NiuTrans.Tensor)
extern DTYPE GetFirstDigitNum(DTYPE p);
extern bool IsFloatValid(float f);
extern bool IsNAN(float f);
extern bool IsNAN(double f);
extern bool IsINF(float f);
extern bool IsINF(double f);
extern void ToLowercase(char * str);
extern char * GetNextWord(char * p);
extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
extern void XMemFreeOnDev(int devID, void * p);
extern DTYPE ToCPU(int devID, void * value);
extern int ToCPUInt(int devID, void * value);
extern bool SetToDevice(int devID, void * p, DTYPE value);
extern unsigned int GetNextPower2(unsigned int n);
extern void XSleep(int sleepTime);
extern double GetClock();
extern double GetClockSec();
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b);
#ifdef USE_CUDA
extern void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size, cudaStream_t stream, int streamDevID);
#else
extern void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size, void * stream, int streamDevID);
#endif
extern void ResetGPUDevices();
} // namespace nts(NiuTrans.Tensor)
#endif // __XUTILITY_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
/* this is a header to include all functions in the "core" workspace */
#ifndef __CHEADER_H__
#define __CHEADER_H__
#include "../XTensor.h"
#include "Concatenate.h"
#include "ConcatenateSolely.h"
#include "CopyIndexed.h"
#include "CopyInGrid.h"
#include "CopyValues.h"
#include "FlushToMem.h"
#include "MakeMergeBlockIndex.h"
#include "MakeSplitBlockIndex.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMul2DMultiTheading.h"
#include "MatrixMul2DParallel.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "Merge.h"
#include "MergeBlockLists.h"
#include "MultiplyElementWise.h"
#include "Negate.h"
#include "Normalize.h"
#include "Power.h"
#include "ReduceMax.h"
#include "ReduceMean.h"
#include "ReduceStandardVariance.h"
#include "ReduceSum.h"
#include "ReduceSumSquared.h"
#include "ReduceVariance.h"
#include "ScaleAndShift.h"
#include "SetData.h"
#include "Sort.h"
#include "Split.h"
#include "Sum.h"
#include "SumByColumnTV.h"
#include "SumByColumnVT.h"
#include "TopK.h"
#include "Unsqueeze.h"
#include "XMatrixSegment.h"
#include "XTensorBLAS.h"
#endif // __CHEADER_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "Concatenate.h"
#include "Merge.h"
#include "ConcatenateSolely.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension

Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes

>> smalls - a list of tensors for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void Concatenate(XList * smalls, XTensor * big, int dim)
{
    /* fix: reject an empty input list explicitly. Previously the loop below
       simply did not run (it starts at i = 1) and the empty list was passed
       straight to Merge */
    CheckNTErrors((smalls != NULL && smalls->count > 0), "Empty list of input tensors!");

    /* the list is "uniform" if every tensor has the same shape and type */
    bool uniform = true;
    for (int i = 1; i < smalls->count; i++) {
        XTensor * a = (XTensor*)smalls->GetItem(i - 1);
        XTensor * b = (XTensor*)smalls->GetItem(i);
        CheckNTErrors((a && b), "Empty input tensors!");
        if (!XTensor::IsIdentical(a, b))
            uniform = false;
    }

    /* identical shapes allow the faster block-merge path */
    if (uniform)
        Merge(smalls, big, dim);
    else
        ConcatenateSolely(smalls, big, dim);
}
/*
concatenate two tensors along a given dimension
>> smallA - the first input tensor
>> smallB - the second input tensor
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
{
    /* wrap the pair into a list and delegate to the list version */
    XList pair(2);
    pair.Add(smallA);
    pair.Add(smallB);

    Concatenate(&pair, big, dim);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __CONCATENATE_H__
#define __CONCATENATE_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes */
void Concatenate(XList * smalls, XTensor * big, int dim);
/* concatenate two tensors along a given dimension */
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONCATENATE_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XUtility.h"
#include "ConcatenateSolely.h"
#include "MergeBlockLists.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension
>> smalls - a list of tensors for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
{
    CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");

    /* dimensions are indexed in reverse order internally (RDI) */
    int catDimSize = 0;
    int dimRDI = big->order - dim - 1;

    /* shape check: every dimension but "dim" must match between each input
       and "big"; the sizes along "dim" must sum to big's size */
    for (int i = 0; i < smalls->count; i++) {
        XTensor * tensor = (XTensor*)smalls->GetItem(i);
        CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
        for (int j = 0; j < big->order; j++) {
            if (j != dimRDI) {
                CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!");
            }
            else {
                catDimSize += tensor->dimSizeRDI[j];
            }
        }
    }
    CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!");

    /* number of consecutive elements below the concatenation dimension */
    int stride = 1;
    for (int i = 0; i < dimRDI; i++)
        stride *= big->dimSizeRDI[i];

    /* number of independent blocks above the concatenation dimension */
    int blockNum = 1;
    for (int i = dimRDI + 1; i < big->order; i++)
        blockNum *= big->dimSizeRDI[i];

    int offset = 0;

    /* two strategies are used - we can either resort to memcpy2d for the case of
       concatenation of a few items, or use MergeBlockLists to merge a large number
       of data blocks */
    if (smalls->count <= MIN_TENSOR_CAT_NUM) {
        for (int i = 0; i < smalls->count; i++) {
            XTensor * tensor = (XTensor*)smalls->GetItem(i);
            /* each input occupies a contiguous chunk of sPitch bytes within
               every tPitch-byte row of the output */
            int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
            int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize;
            int mSize = sPitch;
            int n = blockNum;
            XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
                       (char*)tensor->data, sPitch, tensor->devID,
                       mSize, n);
            offset += sPitch;
        }
    }
    else {
        /* gather the data pointers and per-input block sizes, then merge
           all blocks in one pass */
        XList * sourceArrays = new XList(smalls->count);
        int * blockSizes = new int[smalls->count];
        for (int i = 0; i < smalls->count; i++) {
            XTensor * tensor = (XTensor*)smalls->GetItem(i);
            blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
            sourceArrays->Add(tensor->data);
        }
        MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);

        delete[] blockSizes;
        delete sourceArrays;
    }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __CONCATENATESOLELY_H__
#define __CONCATENATESOLELY_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */
extern "C"
void ConcatenateSolely(XList * smalls, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONCATENATESOLELY_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
*/
#include "../XTensor.h"
#include "../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
/* convert a float array into a float16 (half) array, one element per thread */
__global__
void KernelFloatToFloat16(float * s, __half * t, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size)
        t[idx] = __float2half(s[idx]);
}
/* convert a float16 (half) array into a float array, one element per thread */
__global__
void KernelFloat16ToFloat(__half * s, float * t, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size)
        t[idx] = __half2float(s[idx]);
}
/*
data conversion (cuda code)
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
    CheckNTErrors((devID >= 0), "This code must be run on GPUs!");

    /* nothing to do when the source and target types agree */
    if(typeS == typeT)
        return;

    /* pick a 1D launch configuration covering "size" items */
    int gridSize[3];
    int blockSize[3];
    GDevs.GetCudaThread(devID, size, gridSize, blockSize);
    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(devID, devIDBackup);

    if(typeS == X_FLOAT && typeT == X_FLOAT16)
        KernelFloatToFloat16<<<blocks, threads>>>((float*)s, (__half*)t, size);
    else if(typeS == X_FLOAT16 && typeT == X_FLOAT)
        KernelFloat16ToFloat<<<blocks, threads>>>((__half*)s, (float*)t, size);
    else{
        ShowNTErrors("Unsupported data types for conversion!");
    }

    /* NOTE(review): this second ProtectCudaDev looks like it should be the
       matching "restore" call (e.g. BacktoCudaDev) that switches back to
       devIDBackup - confirm which macro pair is intended here */
    ProtectCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XUtility.h"
#include "CopyBlocks.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksSelected.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy a number of blocks to target positions
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in bytes)
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy (on the host)
>> myMem - the memory pool
*/
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
    if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
        /* copy the index from host to device so that the on-site
           version can read it from GPU memory */
        int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
        XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));

        CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);

        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
#else
        /* BUG FIX: corrected the "Plesae" typo in the error message */
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        /* CPU path: the index is already accessible on site */
        CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
    }
}
/*
copy a number of blocks from source positions to target positions
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in bytes)
>> sourceBlocks - source positions of the copy
>> blockNum - number of blocks (length of sourceBlocks and targetBlocks)
>> target - target data array
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
*/
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
    if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem);
#else
        /* BUG FIX: corrected the "Plesae" typo in the error message */
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        int devID = myMem != NULL ? myMem->devID : -1;

        /* The following code should be fine with GPUs, but too many
           kernel calls would slow down the system. We prefer to use
           one kernel to do block copy in batch (kernel fusion). */
        for (int i = 0; i < blockNum; i++) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                     (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
        }
    }
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __COPYBLOCKS_H__
#define __COPYBLOCKS_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions */
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
/* copy a number of blocks from source positions to target positions */
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKS_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "CopyBlocksInGrid.h"
#include "../XUtility.h"
#include "CopyBlocksInGrid.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy a number of blocks in grid
>> source - pointer to the source data array
>> blockSize - size of a data block (in units)
>> blockNum - number of the blocks (in a grid)
>> gridNum - number of the grids.
   Note that a grid may have a number of blocks
>> target - pointer to the target data array
>> index - source block id for each target block
>> unitSize - size of a data unit (in bytes); only sizeof(int) is supported so far
>> isIndexOnDev - indicates whether the index is on the device already
>> myMem - the memory pool
*/
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
                      int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{
    CheckNTErrors((unitSize == sizeof(int)), "TODO!");

    if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
        int * indexGPU = index;
        if (!isIndexOnDev) {
            /* the index lives on the host; stage it into GPU memory first */
            indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
            XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
        }

        CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);

        if (!isIndexOnDev)
            myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
#else
        /* BUG FIX: corrected the "Plesae" typo in the error message */
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        /* BUG FIX: this branch is also taken when myMem == NULL, but the
           original dereferenced myMem->devID unconditionally. Fall back to
           the CPU device id (-1), as the sibling CopyBlocks() does. */
        int devID = myMem != NULL ? myMem->devID : -1;
        void * buf = XMemAlloc(devID, blockSize * blockNum * unitSize);

        for (int k = 0; k < gridNum; k++) {
            int offset = k * blockSize * blockNum;
            for (int i = 0; i < blockNum; i++) {
                int b = index[k * blockNum + i];
                /* blocks with an out-of-range source id are left untouched */
                if (b >= 0 && b < blockNum) {
                    int * t = (int*)buf + blockSize * i;
                    int * s = (int*)source + offset + blockSize * b;
                    for (int j = 0; j < blockSize; j++)
                        t[j] = s[j];
                }
            }
            XMemCopy((int*)target + offset, devID,
                     buf, devID,
                     blockSize * blockNum * unitSize);
        }

        XMemFree(devID, buf);
    }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __COPYBLOCKSINGRID_CUH__
#define __COPYBLOCKSINGRID_CUH__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy data by index */
extern "C"
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKSINGRID_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __COPYBLOCKSINGRID_H__
#define __COPYBLOCKSINGRID_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */
extern "C"
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKSINGRID_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XUtility.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy a number of blocks to target positions. Here we assume that
all the data has been on the device (CPU/GPU) already.
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in bytes)
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy (on the same device as the data)
>> myMem - the memory pool
*/
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
    if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
        CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
#else
        /* BUG FIX: corrected the "Plesae" typo in the error message */
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else {
        int devID = myMem != NULL ? myMem->devID : -1;

        /* The following code should be fine with GPUs, but too many
           kernel calls would slow down the system. We prefer to use
           one kernel to do block copy in batch (kernel fusion). */
        for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                     (char*)source + b, devID, blockSize);
        }
    }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
#include "../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
copy a number of blocks to target positions
NOTE that this version makes more use of the 2d threads in cuda.
Each thread copies miniBlockSize consecutive entries; the launcher
guarantees that blockSize is a multiple of miniBlockSize.
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in DTYPE units)
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
*/
template<int miniBlockSize>
__global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks)
{
    /* entry index in the block (first of the miniBlockSize entries we copy) */
    int i = (blockDim.x * blockIdx.x + threadIdx.x) * miniBlockSize;

    /* block index */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    if (j >= blockNum)
        return;

    /* target position */
    int k = targetBlocks[j];

    DTYPE * s = source + blockSize * j;
    DTYPE * t = target + blockSize * k;

    if (i < blockSize) {
        /* GENERALIZATION: the original only handled miniBlockSize == 4 and
           miniBlockSize <= 1, and silently copied nothing (printf only) for
           any other instantiation. The unrolled loop below compiles to the
           same code for <4> and <1>, and works for any positive size. */
#pragma unroll
        for (int m = 0; m < miniBlockSize; m++)
            t[i + m] = s[i + m];
    }
}
/*
copy a number of blocks to target positions (cuda version)
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in bytes; must be a multiple of sizeof(DTYPE))
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy (on the device)
>> myMem - memory pool
*/
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
    CheckNTErrors((myMem != NULL), "No memory pool!");
    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
    CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");

    int grids[3];
    int threads[3];

    /* block size measured in DTYPE items rather than bytes */
    int itemNum = blockSize / sizeof(DTYPE);

    if (itemNum % 4 != 0) {
        /* fall back to one item per thread */
        GDevs.GetCudaThread2D(myMem->devID, itemNum, blockNum, MAX_INT, grids, threads);
        KernelCopyBlocks<1> <<<dim3(grids[0], grids[1]), dim3(threads[0], threads[1])>>>
                            ((DTYPE*)source, itemNum, blockNum, (DTYPE*)target, targetBlocks);
    }
    else {
        /* four items per thread when the block size allows it */
        GDevs.GetCudaThread2D(myMem->devID, itemNum / 4, blockNum, MAX_INT, grids, threads);
        KernelCopyBlocks<4> <<<dim3(grids[0], grids[1]), dim3(threads[0], threads[1])>>>
                            ((DTYPE*)source, itemNum, blockNum, (DTYPE*)target, targetBlocks);
    }
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __COPYBLOCKS_CUH__
#define __COPYBLOCKS_CUH__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy a number of blocks to target positions */
__global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */
extern "C"
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKS_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __COPYBLOCKSONSITE_H__
#define __COPYBLOCKSONSITE_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */
extern "C"
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKSONSITE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "CopyBlocks.h"
#include "CopyBlocksSelected.cuh"
#include "../XUtility.h"
#include "../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
copy a number of blocks from source positions to target positions
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in DTYPE units)
>> sourceBlocks - source positions of the copy
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
*/
__global__
void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks)
{
    /* entry index in the block (the original comment wrongly said "block index";
       i is checked against blockSize below) */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block index (the original comment wrongly said "entry index in the block";
       j is checked against blockNum below) */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    if (j >= blockNum)
        return;

    /* source and target block positions of this copy */
    int srcIndex = sourceBlocks[j];
    int tgtIndex = targetBlocks[j];

    DTYPE * s = source + blockSize * srcIndex;
    DTYPE * t = target + blockSize * tgtIndex;

    if (i < blockSize)
        t[i] = s[i];
}
/*
copy a number of blocks from source positions to target positions (cuda version)
>> source - data array (head of the blocks) to copy from
>> blockSize - size of a block (in bytes; must be a multiple of sizeof(DTYPE))
>> sourceBlocks - source positions of the copy (on the host)
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy (on the host)
>> myMem - memory pool
*/
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
    CheckNTErrors((myMem != NULL), "No memory pool!");
    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
    CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");

    /* stage the source/target indices into GPU memory */
    int * srcIndexDev = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
    int * tgtIndexDev = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
    XMemCopy(srcIndexDev, myMem->devID, sourceBlocks, -1, blockNum * sizeof(int));
    XMemCopy(tgtIndexDev, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));

    int grids[3];
    int threads[3];

    GDevs.GetCudaThread2D(myMem->devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, grids, threads);

    KernelCopyBlocksSelected <<<dim3(grids[0], grids[1]), dim3(threads[0], threads[1])>>>
                             ((DTYPE*)source, blockSize / sizeof(DTYPE), srcIndexDev, blockNum, (DTYPE*)target, tgtIndexDev);

    /* release the two staging buffers (LIFO order of the pool) */
    myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
    myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论