Commit 7ac8e731 by xuchen

Merge branch 'xuchen' into xiaotong-working

parents 7ae1562d 7e9d7015
@@ -127,7 +127,6 @@ struct FNNNet
 };
 /* entry of the program */
-extern "C"
 int FNNLMMain(int argc, const char ** argv);
 };
...
@@ -47,9 +47,9 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
 extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
 /* data conversion (for lower precision computation) */
-extern "C" unsigned short FloatToFloat16(float f);
-extern "C" float Float16ToFloat(unsigned short h);
-extern "C" void ConvertDataType(int devID,
+unsigned short FloatToFloat16(float f);
+float Float16ToFloat(unsigned short h);
+void ConvertDataType(int devID,
 void * s, TENSOR_DATA_TYPE typeS,
 void * t, TENSOR_DATA_TYPE typeT, int size);
...
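For reference, a minimal, self-contained sketch of what a float <-> float16 conversion such as the declarations above has to do (bit-level, round toward zero, subnormal halves flushed to zero, NaN payloads not preserved). This is an illustration of the idea, not the library's FloatToFloat16 / Float16ToFloat.

    #include <cstdint>
    #include <cstring>

    /* illustrative float -> IEEE-754 binary16 conversion (truncating) */
    unsigned short FloatToFloat16Sketch(float f)
    {
        uint32_t x;
        std::memcpy(&x, &f, sizeof(x));                         /* reinterpret the bits */
        uint16_t sign = (uint16_t)((x >> 16) & 0x8000);         /* sign bit */
        int32_t  exp  = (int32_t)((x >> 23) & 0xFF) - 127 + 15; /* re-bias the exponent */
        uint32_t mant = x & 0x007FFFFF;
        if (exp >= 31) return (unsigned short)(sign | 0x7C00);  /* overflow, inf and NaN -> inf */
        if (exp <= 0)  return sign;                             /* underflow -> signed zero */
        return (unsigned short)(sign | (exp << 10) | (mant >> 13)); /* truncate the mantissa */
    }

    /* illustrative binary16 -> float conversion (subnormal halves flushed to zero) */
    float Float16ToFloatSketch(unsigned short h)
    {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t mant = h & 0x03FF;
        uint32_t x;
        if (exp == 0)       x = sign;                             /* zero (or flushed subnormal) */
        else if (exp == 31) x = sign | 0x7F800000 | (mant << 13); /* inf / NaN */
        else                x = sign | ((exp - 15 + 127) << 23) | (mant << 13);
        float f;
        std::memcpy(&f, &x, sizeof(f));
        return f;
    }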
@@ -486,9 +486,8 @@ quick sorting
 NOTE: this means that the items may not placed in a continuous memory space
 >> comp - the comparison function
 */
-void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
+void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
 {
-XMemCopy(dataB, -1, dataA, -1, num * width);
 char *lo, *hi; // ends of sub-array currently sorting
 int *indexlo, *indexhi;
 char *mid; // points to middle of subarray
@@ -507,8 +506,8 @@ void XQSort(void * dataA, void * dataB, void * index, int num, int width, int st
 stackptr = 0;
-lo = (char*)dataB;
-hi = (char*)dataB + realStride * (num - 1);
+lo = (char*)data;
+hi = (char*)data + realStride * (num - 1);
 indexlo = (int*)index;
 indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
...
@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
 extern double GetClock();
 extern double GetClockSec();
-extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
+extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
 extern int CompXFloat(const void * a, const void * b);
 #ifdef USE_CUDA
...
@@ -60,7 +60,6 @@ set each entry to its absolute value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaAbsolute(const XTensor * a, XTensor * b)
 {
 CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
...
@@ -34,7 +34,6 @@ __global__
 void KernelAbsolute(__half * a, __half * b, int size);
 /* set each entry to its absolute value */
-extern "C"
 void _CudaAbsolute(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* matrix multiplication in batch mode (CPU code) */
-extern "C"
 void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
 XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
...
@@ -46,7 +46,7 @@ c = a * b * \alpha
 >> cRowSize - row size of matrix c
 >> alpha - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
 void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
 DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
...
@@ -32,7 +32,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 mutilication of a dense matrix with a sparse vector
 c = a * b * \alpha
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
 void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
 DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
@@ -42,7 +42,6 @@ matrix multiplication (for 2d tensors) (cuda version)
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
 */
-extern "C"
 void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
 DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
...
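For the 2d multiplication described above, c = trans(a) * trans(b) * alpha + c * beta, a plain row-major CPU reference makes the transpose flags concrete. This is only a sketch of the semantics (function and parameter names are illustrative), not _CudaMatrixMul2D.

    /* an, am: rows/cols of a as stored; bm: cols of b as stored; c is cn x cm */
    void MatrixMul2DReference(const float * a, bool transA, int an, int am,
                              const float * b, bool transB, int bm,
                              float * c, int cn, int cm, float alpha, float beta)
    {
        int K = transA ? an : am;               /* inner dimension after the optional transposes */
        for (int i = 0; i < cn; i++) {
            for (int j = 0; j < cm; j++) {
                float sum = 0;
                for (int k = 0; k < K; k++) {
                    float av = transA ? a[k * am + i] : a[i * am + k];
                    float bv = transB ? b[j * bm + k] : b[k * bm + j];
                    sum += av * bv;
                }
                c[i * cm + j] = sum * alpha + c[i * cm + j] * beta;
            }
        }
    }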
@@ -30,7 +30,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 matrix multiplication for a block (x1,y1) - (x2,y2)
 where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
 */
-extern "C"
 void _MatrixMul2DMultiTheading(XList * args);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -31,7 +31,6 @@ matrix multiplication (for 2d tensors) with multi-threading.
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired.
 */
-extern "C"
 void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
 XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
...
@@ -34,7 +34,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i)
 >> c - result data array
 >> size - size of c
 */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -51,7 +51,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha
 >> size - size of c
 >> alpha - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -120,7 +120,6 @@ where i is the item index
 >> alpha - the coefficient
 >> leadingDim - dimension along which we perform broadcasting
 */
-extern "C"
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
 int leadingDimRDI = a->order - leadingDim - 1;
...
@@ -29,11 +29,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i) */
@@ -41,7 +41,6 @@ template<int nonZeroAlpha>__global__
 void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
 /* element-wise product of two tensors */
-extern "C"
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
 #endif // USE_CUDA
...
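The kernels above implement exactly the formula quoted in their comments; on the CPU the whole operation is just the loop below (names illustrative, broadcasting over leadingDim omitted).

    /* c(i) = a(i) * b(i) + alpha * c(i); KernelMulElementWise is the alpha == 0 case */
    void MulElementWiseReference(const float * a, const float * b, float * c, int size, float alpha)
    {
        for (int i = 0; i < size; i++)
            c[i] = a[i] * b[i] + alpha * c[i];
    }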
@@ -68,7 +68,6 @@ set each entry to its negtive value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaNegate(const XTensor * a, XTensor * b)
 {
 CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
...
@@ -37,7 +37,6 @@ __global__
 void KernelNegate(__half * a, __half * b, int size);
 /* set each entry to its negtive value */
-extern "C"
 void _CudaNegate(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
...
@@ -66,7 +66,6 @@ set each entry to its sign value
 >> a - input tensor we are processing
 >> b - output tensor we are processing
 */
-extern "C"
 void _CudaSign(const XTensor * a, XTensor * b)
 {
 CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
...
@@ -37,7 +37,6 @@ __global__
 void KernelSign(__half * a, __half * b, int size);
 /* set each entry to its sign value */
-extern "C"
 void _CudaSign(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
...
@@ -35,7 +35,7 @@ c = a + b * \beta
 >> size - the size of a/b/c
 >> beta - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
...
@@ -29,15 +29,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of data arrays (CUDA Kernel) */
-extern "C" __global__
+__global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (cuda version) */
-extern "C"
 void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (cuda version) with an input handle */
-extern "C"
 void _CudaSumWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
...
@@ -39,7 +39,7 @@ c_col = a_col + b * \beta
 >> size - size of the entire data array
 >> beta - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of a tensor and a vector (column vector) */
-extern "C"
 void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
...
@@ -39,7 +39,7 @@ c = a + \sum{col} b_col * \beta
 >> size - size of the entire data array
 >> beta - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
 {
 int row = blockDim.x * blockIdx.x + threadIdx.x;
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of a vector (column vector) and a tensor */
-extern "C"
 void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
...
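The two column-wise summations above can be read off a small CPU sketch for a row-major rowNum x colNum block: "TV" adds a column vector to every column of the tensor, "VT" reduces the columns of the tensor into a column vector. Names and the layout assumption are illustrative only.

    /* c_col = a_col + b * beta, where b is a column vector of length rowNum */
    void SumByColumnTVReference(const float * a, const float * b, float * c,
                                int rowNum, int colNum, float beta)
    {
        for (int i = 0; i < rowNum; i++)
            for (int j = 0; j < colNum; j++)
                c[i * colNum + j] = a[i * colNum + j] + b[i] * beta;
    }

    /* c = a + \sum_{col} b_col * beta, where a and c are column vectors of length rowNum */
    void SumByColumnVTReference(const float * a, const float * b, float * c,
                                int rowNum, int colNum, float beta)
    {
        for (int i = 0; i < rowNum; i++) {
            float rowSum = 0;                        /* sum of row i over all columns of b */
            for (int j = 0; j < colNum; j++)
                rowSum += b[i * colNum + j];
            c[i] = a[i] + rowSum * beta;
        }
    }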
@@ -143,7 +143,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
 }
 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
 const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
 const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
...
@@ -27,14 +27,12 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* matrix multiplication (BLAS) */
-extern "C"
 void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
 XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
 #ifdef USE_CUDA
 /* matrix multiplication via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMUL(cublasHandle_t * handle,
 const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
 const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -42,7 +40,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
 int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
 const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
 const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -51,7 +48,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
 DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
 const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
 const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
@@ -60,7 +56,6 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
 DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
 const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
 int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
...
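The "batch and strided" variant declared above runs the same-shaped multiplication on a whole batch, with the i-th problem taken at element offset i * stride from each base pointer (the pattern behind cublasSgemmStridedBatched-style calls). A CPU reference of that idea, without transposes, might look like this (illustrative only, not the library routine):

    /* the p-th problem multiplies (a + p*strideA)[n x m] by (b + p*strideB)[m x k]
       into (c + p*strideC)[n x k]: c = alpha * a*b + beta * c */
    void MatMulBatchedStridedReference(const float * a, long long strideA,
                                       const float * b, long long strideB,
                                       float * c, long long strideC,
                                       int n, int m, int k, int batch,
                                       float alpha, float beta)
    {
        for (int p = 0; p < batch; p++) {
            const float * ap = a + p * strideA;
            const float * bp = b + p * strideB;
            float * cp = c + p * strideC;
            for (int i = 0; i < n; i++)
                for (int j = 0; j < k; j++) {
                    float sum = 0;
                    for (int t = 0; t < m; t++)
                        sum += ap[i * m + t] * bp[t * k + j];
                    cp[i * k + j] = alpha * sum + beta * cp[i * k + j];
                }
        }
    }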
@@ -27,14 +27,12 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-extern "C"
 void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
 /*
 generate a tensor with selected data in range[low,high] along the given dimension
 c = select(a)
 */
-extern "C"
 void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -60,7 +60,6 @@ set each entry to its log value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaLog(const XTensor * a, XTensor * b)
 {
 CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
...
@@ -37,7 +37,6 @@ __global__
 void KernelLog(__half * a, __half * b, int size);
 /* set each entry to its log value */
-extern "C"
 void _CudaLog(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
...
@@ -88,7 +88,6 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 >> b - the bias
 >> epsilon - a parameter
 */
-extern "C"
 void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 const XTensor * mean, const XTensor * var,
 const XTensor * a, const XTensor * b,
...
@@ -43,7 +43,6 @@ normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */
-extern "C"
 void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 const XTensor * mean, const XTensor * var,
 const XTensor * a, const XTensor * b, DTYPE epsilon);
...
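The formula quoted above, y = a * (x - mean) / sqrt(variance + epsilon) + b, reduces to a one-line loop once mean and variance along the normalized dimension are available. A simplified CPU sketch over num positions (per-slice handling along dim omitted, names illustrative):

    #include <cmath>

    void NormalizeReference(const float * x, float * y,
                            const float * mean, const float * var,
                            const float * a, const float * b,
                            float epsilon, int num)
    {
        for (int i = 0; i < num; i++)
            y[i] = a[i] * (x[i] - mean[i]) / std::sqrt(var[i] + epsilon) + b[i];
    }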
@@ -100,7 +100,6 @@ void KernelPower(__half * a, __half * b, __half p, int size)
 }
 /* get the power of the entries */
-extern "C"
 void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
 {
 CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
...
@@ -37,7 +37,6 @@ __global__
 void KernelSqrtV2(__half * a, __half * b, int size);
 /* get the power of the entries */
-extern "C"
 void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
 #endif // USE_CUDA
...
@@ -47,8 +47,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
 }
 #endif
-CheckNTErrors((a->dataType == DEFAULT_DTYPE),
-"The tensor is not in the default data type!");
+CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
 /* sparse tensor */
 if(a->isSparse){
...
@@ -37,7 +37,6 @@ __global__
 void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);
 /* scale and shift all tensor entires b = a * scale + shift (cuda version) */
-extern "C"
 void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);
 #endif // USE_CUDA
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* copy data by index */
-extern "C"
 void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
 #endif // USE_CUDA
...
@@ -33,7 +33,6 @@ __global__
 void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
 /* copy a number of blocks to target positions (cuda version) */
-extern "C"
 void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
 #endif // USE_CUDA
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy a number of blocks to target positions (on site) */
-extern "C"
 void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -72,7 +72,7 @@ copy a number of blocks from source positions to target positions (cuda version)
 */
 void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-CheckNTErrors((devID >= 0), "Wrong device to run!");
+CheckNTErrors(devID >= 0, "Wrong device to run!");
 CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
 /* copy the index to the GPU memory */
...
@@ -33,7 +33,6 @@ __global__
 void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks);
 /* copy a number of blocks form source positions to target positions (cuda version) */
-extern "C"
 void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
 #endif // USE_CUDA
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy selected sub-tensors */
-extern "C"
 void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
 /*
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* copy all elements from a source matrix to a target matrix */
-extern "C"
 void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
 #endif // USE_CUDA
...
@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* get the max-valued items along a dimension of the tensor (cuda version) */
-extern "C"
 void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
 #endif // USE_CUDA
...
@@ -31,7 +31,6 @@ standard variance of the items along a dimension of the tensor
 For a 1-dimensional data array a,
 variance = (1/n * \sum_i (a_i - mean)^2)^0.5
 */
-extern "C"
 void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -53,12 +53,10 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
 int dimRDI = input->order - dim - 1;
 for(int i = 0; i < input->order; i++){
 if(i < dimRDI){
-CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
-"Unmatched tensors!");
+CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
 }
 else if(i > dimRDI){
-CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
-"Unmatched tensors!");
+CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
 }
 }
...
@@ -34,7 +34,6 @@ For a 1-dimensional data array a,
 sum = \sum_i ((a_i + shift)^power) if isExp == false
 sum = \sum_i exp((a_i + shift)^power) if isExp == true
 */
-extern "C"
 void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp);
 #endif // USE_CUDA
...
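A 1-d sketch of the reduction described above (the real routine applies it along one dimension of a tensor; names illustrative):

    #include <cmath>

    /* sum = \sum_i (a_i + shift)^power, or \sum_i exp((a_i + shift)^power) when isExp is true */
    float ReduceSumReference(const float * a, int n, float shift, float power, bool isExp)
    {
        float sum = 0;
        for (int i = 0; i < n; i++) {
            float v = std::pow(a[i] + shift, power);
            sum += isExp ? std::exp(v) : v;
        }
        return sum;
    }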
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* concatenate a list of tensors along a given dimension */
-extern "C"
 void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -69,7 +69,6 @@ set target data block index for the data movement in split
 >> gridNum - number of grids
 >> mem - the memory pool
 */
-extern "C"
 void _CudaMakeMergeBlockIndex(int devID,
 int * blockIndex, int blockNum, int blockNumInMerge,
 int splitSizeInGrid, int gridSize, int gridNum)
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* set target data block index for the data movement in split */
-extern "C"
 void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
 int splitSizeInGrid, int gridSize, int gridNum);
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* set target data block index for the data movement in merge */
-extern "C"
 void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
 int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
...
@@ -57,7 +57,6 @@ set target data block index for the data movement in split
 >> blockSplitSize - size of the splitted block
 >> blockNum - number of data blocks
 */
-extern "C"
 void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum)
 {
 int cudaGrids[3];
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* set target data block index for the data movement in split */
-extern "C"
 void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
 #endif // USE_CUDA
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* set target data block index for the data movement in split */
-extern "C"
 void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -99,8 +99,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
 char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
 for (int k = 0; k < mergedNum; k++) {
 XMemCopy2D(tData + k * tStep, tPtich, t->devID,
-sData + k * sStep, sPitch, s->devID,
-mSize, n);
+sData + k * sStep, sPitch, s->devID, mSize, n);
 }
 }
 }
...
@@ -71,7 +71,6 @@ merge data by blocks (cuda version)
 >> target - target data array
 >> myMem - the memory pool
 */
-extern "C"
 void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
 {
 CheckNTErrors((myMem != NULL), "No memory pool!");
...
@@ -33,7 +33,6 @@ __global__
 void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
 /* merge data by blocks (cuda version) */
-extern "C"
 void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
 #endif // USE_CUDA
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* merge data by blocks */
-extern "C"
 void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -66,7 +66,6 @@ insert a dimension by copying the blocks for x times (where x is the size of the
 >> dim - where to insert the dimension
 >> dSize - size of the newly-inserted dimension
 */
-extern "C"
 void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
 {
 int blockSize = 1;
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* duplicate the data along a given dimension */
-extern "C"
 void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
 #endif // USE_CUDA
...
@@ -20,6 +20,7 @@
 */
 #include "../../XTensor.h"
+#include "../movement/CopyValues.h"
 #include "../../XUtility.h"
 #include "../../XName.h"
 #include "Sort.h"
@@ -63,15 +64,15 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
 blockNum *= a->dimSizeRDI[i];
 int blockSize = stride * strideNum;
+_CopyValues(a, b);
 for (int k = 0; k < blockNum; k++) {
 for (int i = 0; i < stride; i++) {
-void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
 void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
 void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
 /* we sort the data array along "dim" */
 if (a->dataType == X_FLOAT)
-XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
+XQSort(dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
 else {
 ShowNTErrors("TODO!");
 }
...
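The change above replaces the old copy-inside-XQSort behaviour with an explicit _CopyValues(a, b) followed by an in-place sort of b. A CPU sketch of that pattern for one strided slice, with the index array kept in sync (ascending order is an assumption here; the actual order is whatever CompXFloat defines):

    #include <algorithm>
    #include <cstring>
    #include <utility>
    #include <vector>

    void SortStridedSliceWithIndex(const float * a, float * b, int * index,
                                   int strideNum, int stride, int bufferSize)
    {
        std::memcpy(b, a, bufferSize * sizeof(float));       /* the _CopyValues(a, b) step */

        std::vector<std::pair<float, int> > items(strideNum);
        for (int i = 0; i < strideNum; i++)                  /* gather the strided elements */
            items[i] = std::make_pair(b[i * stride], index[i * stride]);

        std::sort(items.begin(), items.end());               /* sort values, carrying indices along */

        for (int i = 0; i < strideNum; i++) {                /* scatter back in place */
            b[i * stride] = items[i].first;
            index[i * stride] = items[i].second;
        }
    }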
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* sort the tensor along a given dimension */
-extern "C"
 void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
 #endif // USE_CUDA
...
@@ -39,7 +39,6 @@ void _SortMe(XTensor * a, XTensor * index, int dim);
 sort the data along a given dimension (return a XTensor structure)
 make a new tensor to keep the result and return it
 */
-extern "C"
 void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* get the top-k items along a given dimension */
-extern "C"
 void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
 #endif // USE_CUDA
...
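For the top-k declared above, the contract is the k best values along a dimension plus their original positions. A 1-d CPU sketch (largest-first is an assumption; names illustrative):

    #include <algorithm>
    #include <numeric>
    #include <vector>

    void TopKReference(const float * a, int n, int k, float * b, int * index)
    {
        std::vector<int> order(n);
        std::iota(order.begin(), order.end(), 0);            /* 0, 1, ..., n-1 */
        std::partial_sort(order.begin(), order.begin() + k, order.end(),
                          [&](int x, int y) { return a[x] > a[y]; });
        for (int i = 0; i < k; i++) {
            b[i] = a[order[i]];                              /* k largest values */
            index[i] = order[i];                             /* and where they came from */
        }
    }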
@@ -63,7 +63,6 @@ set the cell to the ascending order along a given dimension
 >> a - the tensor
 >> dim - the dimension
 */
-extern "C"
 void CudaSetAscendingOrder(XTensor * a, int dim)
 {
 CheckNTErrors((a->dataType == X_INT), "TODO!");
...
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* set the cell to the ascending order along a given dimension */
-extern "C"
 void CudaSetAscendingOrder(XTensor * a, int dim);
 #endif // USE_CUDA
...
@@ -28,15 +28,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* segmentation and parallel processing for 2d tensors (i.e., matrices) */
 /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
-extern "C"
 void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
 /* segment a block into sub-blocks */
-extern "C"
 int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex);
 /* segment a block into sub-blocks */
-extern "C"
 int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -35,11 +35,9 @@ y = 1 if x > 1
 x if -1 <= x <= 1
 -1 if x < -1
 */
-extern "C"
 void _CudaHardTanH(const XTensor * input, XTensor * output);
 /* de/dx (Cuda version) */
-extern "C"
 void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
 XTensor * dedy, XTensor * dedx,
 LOSS_FUNCTION_NAME lossName);
...
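The piecewise definition quoted above fixes both passes: y is x clipped to [-1, 1], and dE/dx is dE/dy inside that interval and 0 outside. A CPU sketch (loss-specific handling of gold/lossName omitted):

    float HardTanHReference(float x)
    {
        return x > 1.0f ? 1.0f : (x < -1.0f ? -1.0f : x);
    }

    void HardTanHBackwardReference(const float * x, const float * dedy, float * dedx, int size)
    {
        for (int i = 0; i < size; i++)
            dedx[i] = (x[i] > 1.0f || x[i] < -1.0f) ? 0.0f : dedy[i];  /* derivative is 1 on [-1, 1] */
    }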
@@ -190,7 +190,7 @@ set dE/dx = exp(y)
 >> size - size of output
 >> lossName - name of the loss function
 */
-extern "C" __global__
+__global__
 void KernelExpLoss(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size, LOSS_FUNCTION_NAME lossName)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
...
@@ -223,7 +223,7 @@ backward compuation for squared error (Cuda kernel)
 >> y - model output (in vector)
 >> size - size of the vector (dedy)
 */
-extern "C" __global__
+__global__
 void KernelLossBackwardSquaredError(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -243,7 +243,7 @@ backward compuation of blocks for squared error (Cuda kernel)
 >> lenInBlock - number of items in a block for computation
 >> size - size of the vector (dedy)
 */
-extern "C" __global__
+__global__
 void KernelLossBackwardSquaredErrorBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
 int blockSize, int begInBlock, int lenInBlock, int size)
 {
@@ -266,7 +266,7 @@ backward compuation for cross entropy (Cuda kernel)
 >> y - model output (in vector)
 >> size - size of the vector (dedy)
 */
-extern "C" __global__
+__global__
 void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, int tBeg, int tLen, int yBeg, int blockNum, int stride, int dimensionSize)
 {
 int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -298,7 +298,7 @@ backward compuation for cross entropy (Cuda kernel)
 >> lenInBlock - number of items in a block for computation
 >> size - size of the vector (dedy)
 */
-extern "C" __global__
+__global__
 void KernelLossBackwardCrossEntropyBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
 int blockSize, int begInBlock, int lenInBlock, int size)
 {
...
@@ -30,21 +30,17 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* compute the loss (cuda version) */
-extern "C"
 DTYPE _CudaLossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
 bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
 /* compute the loss in log scale (cuda version) */
-extern "C"
 DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
 int leadDim, int gBeg, int gLen, int oBeg);
 /* backward compuation for a single element (cuda version) */
-extern "C"
 DTYPE _CudaLossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
 /* backward compuation for (dense) vectors (cuda version) */
-extern "C"
 void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
 LOSS_FUNCTION_NAME LFName,
 int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
...
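The backward kernels touched above compute dE/dy for a gold vector t and an output vector y. With the usual conventions (E = 1/2 * sum (t - y)^2 for squared error, E = -sum t * log(y) for cross entropy; the 1/2 factor is an assumption), the element-wise rules are:

    void LossBackwardSquaredErrorReference(float * dedy, const float * t, const float * y, int size)
    {
        for (int i = 0; i < size; i++)
            dedy[i] = y[i] - t[i];                 /* d/dy of 1/2 * (t - y)^2 */
    }

    void LossBackwardCrossEntropyReference(float * dedy, const float * t, const float * y, int size)
    {
        for (int i = 0; i < size; i++)
            dedy[i] = -t[i] / y[i];                /* d/dy of -t * log(y) */
    }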
@@ -30,11 +30,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* rectify function y = max(0, x) (Cuda version) */
-extern "C"
 void _CudaRectify(const XTensor * input, XTensor * output);
 /* de/dx (Cuda version) */
-extern "C"
 void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
 XTensor * dedy, XTensor * dedx,
 LOSS_FUNCTION_NAME lossName);
...
@@ -30,11 +30,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* rectify function y = max(0, x) (Cuda version) */
-extern "C"
 void _CudaSigmoid(const XTensor * input, XTensor * output);
 /* de/dx (Cuda version) */
-extern "C"
 void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
 XTensor * dedy, XTensor * dedx,
 LOSS_FUNCTION_NAME lossName);
...
@@ -30,15 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
-extern "C"
 void _CudaSoftmax(const XTensor * input, XTensor * output, int leadDim);
 /* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
-extern "C"
 void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
 /* de/dx (Cuda version) */
-extern "C"
 void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
 XTensor * dedy, XTensor * dedx,
 int leadDim,
...
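The softmax quoted above, y = e^x / \sum_i e^{x_i}, is normally computed with the maximum subtracted first for numerical stability, which is presumably why a variant with explicit sum and max tensors is also declared. A per-row CPU sketch (illustrative only):

    #include <cmath>

    void SoftmaxReference(const float * x, float * y, int n)
    {
        float maxv = x[0];
        for (int i = 1; i < n; i++)
            if (x[i] > maxv) maxv = x[i];          /* subtracting the max avoids overflow in exp */

        float sum = 0;
        for (int i = 0; i < n; i++) {
            y[i] = std::exp(x[i] - maxv);
            sum += y[i];
        }
        for (int i = 0; i < n; i++)
            y[i] /= sum;
    }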
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for Absolute Function */
-extern "C"
 bool TestAbsolute();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for Concatenate Function */
-extern "C"
 bool TestConcatenate();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for ConcatenateSolely Function */
-extern "C"
 bool TestConcatenateSolely();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for ConvertDataType Function */
-extern "C"
 bool TestConvertDataType();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for CopyIndexed Function */
-extern "C"
 bool TestCopyIndexed();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for CopyValues Function */
-extern "C"
 bool TestCopyValues();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for HardTanH Function */
-extern "C"
 bool TestHardTanH();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for Identity Function */
-extern "C"
 bool TestIdentity();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for Log Function */
-extern "C"
 bool TestLog();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for LogSoftmax Function */
-extern "C"
 bool TestLogSoftmax();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for Loss Function */
-extern "C"
 bool TestLoss();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for MatrixMul Function */
-extern "C"
 bool TestMatrixMul();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -248,7 +248,6 @@ bool TestMatrixMul2D2()
 */
 /* test for MatrixMul2D Function */
-extern "C"
 bool TestMatrixMul2D()
 {
 XPRINT(0, stdout, "[TEST MATRIXMUL2D] matrix multiplication (for 2d tensors) \n");
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for MatrixMul2D Function */
-extern "C"
 bool TestMatrixMul2D();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for MatrixMul2DParallel Function */
-extern "C"
 bool TestMatrixMul2DParallel();
 } // namespace nts(NiuTrans.Tensor)
...
@@ -210,14 +210,14 @@ bool TestSort()
 XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");
 bool returnFlag = true, caseFlag = true;
-///* case 1 test */
-//caseFlag = TestSort1();
-//if (!caseFlag) {
-// returnFlag = false;
-// XPRINT(0, stdout, ">> case 1 failed!\n");
-//}
-//else
-// XPRINT(0, stdout, ">> case 1 passed!\n");
+/* case 1 test */
+caseFlag = TestSort1();
+if (!caseFlag) {
+returnFlag = false;
+XPRINT(0, stdout, ">> case 1 failed!\n");
+}
+else
+XPRINT(0, stdout, ">> case 1 passed!\n");
 /* case 2 test */
 caseFlag = TestSort2();
...
@@ -69,7 +69,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for all Function */
-extern "C"
 bool Test();
 } // namespace nts(NiuTrans.Tensor)
...