Commit 906eebb7 by xuchen

merge with xuchen branch

parents 7283b3dc b3a76184
......@@ -70,7 +70,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
LOSS_FUNCTION_NAME lossName)
{
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
}
}
\ No newline at end of file
......@@ -328,6 +328,37 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
}
/*
create a hyper edge with an input tensor and a list of output tensors
>> t - the input tensor
>> list - a list of output tensors
>> id - id of the edge type
*/
void XLink::MakeLink(XTensor * t, XList * list, int id)
{
/* forward */
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
if(h == NULL)
continue;
XLink &income = h->income;
income.Reset();
income.SetHead(h);
income.SetType(id);
income.AddTail(t);
}
/* backward */
XLink &outgo = t->outgo;
CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
if(h == NULL)
continue;
outgo.AddTail(h);
}
}
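A minimal usage sketch of this new one-to-many MakeLink (tensor names and the op id are illustrative, not from this commit):
/* record that one input tensor produces several outputs */
XList outputs(2);
outputs.Add(&half0);
outputs.Add(&half1);
XLink::MakeLink(&big, &outputs, SHAPE_SPLIT);
/* each output's income edge now points back to &big,
   and big's outgo edge lists both outputs */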
/*
add parameters
>> h - head
>> param - the parameter we want to introduce
......
......@@ -139,6 +139,10 @@ struct XLink
static
void MakeLink(const XList * list, XTensor * h, int id);
/* create a hyper edge with an input tensor and a list of output tensors */
static
void MakeLink(XTensor * h, XList * list, int id);
/* add a parameter */
static
void AddParamToHead(XTensor * h, DTYPE param);
......
......@@ -26,76 +26,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get operator name */
const char * GetOPName(int type)
{
if((type & MATH_BASE) != 0){
if(type == MATH_ABSOLUTE)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if(type == MATH_MATRIXMUL)
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if(type == MATH_MATRIXMULBATCHED)
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if(type == MATH_MULTIPLY)
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if(type == MATH_NEGATE)
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if(type == MATH_SIGN)
else if (type == MATH_SIGN)
return "M_SIGN";
else if(type == MATH_SUM)
else if (type == MATH_SUM)
return "M_SUM";
else if(type == MATH_LOG)
return "M_NORMALIZE";
else if(type == MATH_NORMALIZE)
else if (type == MATH_LOG)
return "M_LOG";
else if(type == MATH_POWER)
else if (type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if (type == MATH_POWER)
return "M_POWER";
else if(type == MATH_SCALEANDSHIFT)
else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if(type == REDUCE_REDUCEMAX)
else if (type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if(type == REDUCE_REDUCEMEAN)
else if (type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if(type == REDUCE_REDUCESUM)
else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if(type == REDUCE_REDUCESUMSQUARED)
else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if(type == REDUCE_REDUCEVARIANCE)
else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
}
else if((type & DATA_BASE) != 0){
if(type == GETANDSET_SELECT)
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if(type == MOVEMENT_COPYINDEXED)
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if(type == MOVEMENT_COPYVALUES)
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if(type == SHAPE_CONCATENATE)
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if(type == SHAPE_MERGE)
else if (type == SHAPE_MERGE)
return "S_MERGE";
else if(type == SHAPE_MERGE_LIST)
else if (type == SHAPE_MERGE_LIST)
return "S_MERGE_LIST";
else if(type == SHAPE_PERMUTE)
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if(type == SHAPE_SPLIT)
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if(type == SHAPE_SPLIT_LIST)
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if(type == SHAPE_TRANSPOSE)
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if(type == SHAPE_UNSQUEEZE)
else if (type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
else if (type == SORT_SORT)
return "S_SORT";
else if (type == SORT_TOPK)
return "S_TOPK";
}
else if((type & FUNCTION_BASE) != 0){
if(type == FUNC_HARDTANH)
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if(type == FUNC_IDENTITY)
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
else if(type == FUNC_LOGSOFTMAX)
else if (type == FUNC_LOGSOFTMAX)
return "F_LOGSOFTMAX";
else if(type == FUNC_RECTIFY)
else if (type == FUNC_RECTIFY)
return "F_RECTIFY";
else if(type == FUNC_SIGMOID)
else if (type == FUNC_SIGMOID)
return "F_SIGMOID";
else if(type == FUNC_SOFTMAX)
else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX";
}
......
......@@ -69,6 +69,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define SHAPE_TRANSPOSE SHAPE_SPLIT_LIST + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
#define SORT SHAPE_UNSQUEEZE + 1
#define SORT_SORT SORT + 1
#define SORT_TOPK SORT_SORT + 1
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
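An editorial aside on these defines (a general C preprocessor point; the parenthesized form below is a sketch, not what the commit adds): the right-hand sides are unparenthesized sums, which happens to be safe for the `+ 1` chains and the equality tests in GetOPName, but parenthesizing macro arithmetic is the defensive form:
/* defensive variant (sketch), assuming the same id scheme */
#define SORT       (SHAPE_UNSQUEEZE + 1)
#define SORT_SORT  (SORT + 1)
#define SORT_TOPK  (SORT_SORT + 1)
/* e.g. SORT_TOPK * 2 would otherwise expand to SORT_SORT + 1 * 2 */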
......
......@@ -1121,7 +1121,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
if(isSparse){
/*
for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrix as an examples,
ordered by key. Take a (2-dimensional) matrix as an example,
we have key = m * i + j;
The data array is
---------
......
......@@ -486,8 +486,9 @@ quick sorting
NOTE: this means that the items may not be placed in a continuous memory space
>> comp - the comparison function
*/
void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
{
XMemCopy(dataB, -1, dataA, -1, num * width);
char *lo, *hi; // ends of sub-array currently sorting
int *indexlo, *indexhi;
char *mid; // points to middle of subarray
......@@ -506,8 +507,8 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
lo = (char*)dataB;
hi = (char*)dataB + realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......
......@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
extern double GetClock();
extern double GetClockSec();
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b);
#ifdef USE_CUDA
......
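A usage sketch of the revised XQSort, assuming the contract implied by the XMemCopy added above: dataA is copied into dataB and the sort runs on dataB, leaving the source untouched (buffers are illustrative):
/* sort a copy of 'src' into 'dst' without disturbing 'src' */
float src[8] = {3, 1, 4, 1, 5, 9, 2, 6};
float dst[8];
XQSort(src, dst, NULL, 8, sizeof(float), 1, CompXFloat);
/* src is unchanged; dst holds the items in the CompXFloat ordering */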
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Absolute.h"
#include "Absolute.cuh"
......@@ -28,21 +29,54 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its absolute value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Absolute(XTensor * a)
void _Absolute(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaAbsolute(a);
_CudaAbsolute(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)fabs(d[i]);
db[i] = (DTYPE)fabs(d[i]);
}
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _AbsoluteMe(XTensor * a)
{
_Absolute(a, a);
}
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the absolute value of the input tensor
*/
XTensor Absolute(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Absolute function */
_Absolute(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_ABSOLUTE);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
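The Absolute hunk sets the template that Negate, Sign, Log and Power follow below: a two-tensor _Op, an in-place _OpMe wrapper, and an Op that returns a TMP tensor and records the operation with XLink::MakeLink. A minimal usage sketch (InitTensor2D is assumed from the library's init helpers; sizes illustrative):
XTensor a, b;
InitTensor2D(&a, 2, 3);
InitTensor2D(&b, 2, 3);
_Absolute(&a, &b);         /* write |a| into a preallocated b */
_AbsoluteMe(&a);           /* in-place: a = |a| */
XTensor c = Absolute(a);   /* TMP result wired into the network via XLink */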
......@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its absolute value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelAbsolute(DTYPE * d, int size)
void KernelAbsolute(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = fabs(d[i]);
b[i] = fabs(a[i]);
}
/*
set each entry to its absolute value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelAbsolute(__half * d, int size)
void KernelAbsolute(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its with float16 data type value
>> a - the tensor
set each entry to its absolute value
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaAbsolute(XTensor * a)
void _CudaAbsolute(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -74,10 +78,10 @@ void _CudaAbsolute(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelAbsolute << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelAbsolute << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -27,15 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its absolute value (CUDA Kernel) */
__global__
void KernelAbsolute(DTYPE * d, int size);
void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
__global__
void KernelAbsolute(__half * d, int size);
void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
extern "C"
void _CudaAbsolute(XTensor * a);
void _CudaAbsolute(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */
extern "C"
void _Absolute(XTensor * a);
void _Absolute(const XTensor * a, XTensor * b);
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _AbsoluteMe(XTensor * a);
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Absolute(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -38,11 +38,11 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> beta - scalar
*/
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
return;
......
......@@ -28,8 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
extern "C"
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -66,4 +66,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
#endif // __MATRIXMUL_H__
\ No newline at end of file
......@@ -45,9 +45,9 @@ where trans() return the transposed matrix if the flag is fired
>> stream - the stream for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......
......@@ -122,9 +122,8 @@ where trans() return the transposed matrix if the flag is fired
>> stream - the stream for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha, DTYPE beta, XStream * stream)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......
......@@ -44,7 +44,7 @@ where trans() return the transposed matrix if the flag is fired
*/
extern "C"
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
#endif // USE_CUDA
......
......@@ -41,8 +41,8 @@ where trans() return the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module
*/
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
......@@ -32,8 +32,8 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired.
*/
extern "C"
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -46,9 +46,8 @@ where trans() returns the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module
*/
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
......@@ -169,14 +168,12 @@ where trans() returns the transposed matrix if the flag is fired.
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(&a != &NULLTensor && &b != &NULLTensor, "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
......@@ -191,13 +188,13 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
/* call _MatrixMulBatched function */
......
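A worked shape example for the rewritten dimension loop (the old code read dimSizeRDI, which stores dimensions in reverse order; the new loop reads dimSize directly):
/* sketch: a is (4, 2, 3), b is (4, 3, 5), no transposition
   loop i in [0, a.order - 2): dimSize = {4}   (batch dims, natural order)
   then append an = 2 and bm = 5               →  c is (4, 2, 5) */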
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
......@@ -27,21 +28,55 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its minus value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Negate(XTensor * a)
void _Negate(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaNegate(a);
_CudaNegate(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = -d[i];
db[i] = -d[i];
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
_Negate(a, a);
}
/*
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of the input tensor
*/
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,45 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its negative value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(DTYPE * d, int size)
void KernelNegate(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = -d[i];
b[i] = -a[i];
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * d, int size)
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
d[i] = __hsub(__float2half(0), d[i]);
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
d[i] = __float2half(-__half2float(d[i]));
b[i] = __float2half(-__half2float(a[i]));
#endif
}
/*
set each entry to its negative value
>> a - the tensor
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaNegate(XTensor * a)
void _CudaNegate(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -82,10 +86,10 @@ void _CudaNegate(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its negative value (CUDA Kernel) */
__global__
void KernelNegate(DTYPE * d, int size);
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type*/
__global__
void KernelNegate(__half * d, int size);
void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */
extern "C"
void _CudaNegate(XTensor * a);
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
extern "C"
void _Negate(XTensor * a);
void _Negate(const XTensor * a, XTensor * b);
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
/*
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
......@@ -27,27 +28,60 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its sign value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Sign(XTensor * a)
void _Sign(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaSign(a);
_CudaSign(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0)
d[i] = 1.0F;
db[i] = 1.0F;
else if (d[i] == 0)
d[i] = 0.0F;
db[i] = 0.0F;
else
d[i] = -1.0F;
db[i] = -1.0F;
}
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
_Sign(a, a);
}
/*
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,43 +29,47 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its sign value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(DTYPE * d, int size)
void KernelSign(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] > 0)
d[i] = 1.0F;
else if (d[i] == 0)
d[i] = 0.0F;
if (a[i] > 0)
b[i] = 1.0F;
else if (a[i] == 0)
b[i] = 0.0F;
else
d[i] = -1.0F;
b[i] = -1.0F;
}
}
/*
set each entry to its sign value (CUDA Kernel)
set each entry to its sign value with float16 data type (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(__half * d, int size)
void KernelSign(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its with float16 data type value
>> a - the tensor
set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
extern "C"
void _CudaSign(XTensor * a)
void _CudaSign(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -80,10 +84,10 @@ void _CudaSign(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its sign value (CUDA Kernel) */
__global__
void KernelSign(DTYPE * d, int size);
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
__global__
void KernelSign(__half * d, int size);
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
extern "C"
void _CudaSign(XTensor * a);
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
extern "C"
void _Sign(XTensor * a);
void _Sign(const XTensor * a, XTensor * b);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SignMe(XTensor * a);
/*
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -37,8 +37,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta
>> c - output matrix (2d tensor)
*/
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
......@@ -89,11 +89,11 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS
*/
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
/*
matrix-matrix multiplication
......@@ -145,11 +145,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
/*
matrix-matrix multiplication
......@@ -202,10 +202,10 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS
*/
void _CudaBLASMatrixMULList(cublasHandle_t * handle,
const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c,
int count, DTYPE alpha, DTYPE beta)
const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c,
int count, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
......
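For readers new to the strided-batched entry point, the addressing convention it exposes is the standard cuBLAS one (as in cublasSgemmStridedBatched, to which a wrapper like this would typically forward); a sketch, with strides counted in elements:
/* matrix i of each operand sits at a fixed element offset:
   a_i = (DTYPE*)a + i * strideA
   b_i = (DTYPE*)b + i * strideB
   c_i = (DTYPE*)c + i * strideC      for i in [0, count) */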
......@@ -28,38 +28,42 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */
extern "C"
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
......
......@@ -96,7 +96,6 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!");
......@@ -110,8 +109,8 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
dimSize[i] = a.dimSize[i];
}
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
/* call _SelectRange function */
......
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Log.h"
#include "Log.cuh"
#include <math.h>
......@@ -27,22 +28,55 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its log value
>> a - the tensor we are processing
set every entry to its log value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Log(XTensor * a)
void _Log(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaLog(a);
_CudaLog(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)log(d[i]);
db[i] = (DTYPE)log(d[i]);
}
/*
set every entry to its log value
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _LogMe(XTensor * a)
{
_Log(a, a);
}
/*
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the log value of the input tensor
*/
XTensor Log(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Log function */
_Log(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_LOG);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its log value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelLog(DTYPE * d, int size)
void KernelLog(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = log(d[i]);
b[i] = log(a[i]);
}
/*
set each entry to its log value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelLog(__half * d, int size)
void KernelLog(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its log value
>> a - the tensor
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaLog(XTensor * a)
void _CudaLog(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -74,10 +78,10 @@ void _CudaLog(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelLog << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelLog << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelLog << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelLog << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its log value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * d, int size);
void KernelLog(DTYPE * a, DTYPE * b, int size);
/* set each entry to its log value (CUDA Kernel) with float16 data type*/
__global__
void KernelLog(__half * d, int size);
void KernelLog(__half * a, __half * b, int size);
/* set each entry to its log value */
extern "C"
void _CudaLog(XTensor * a);
void _CudaLog(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */
extern "C"
void _Log(XTensor * a);
void _Log(const XTensor * a, XTensor * b);
/*
set every entry to its log value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _LogMe(XTensor * a);
/*
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Log(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Power.h"
#include "Power.cuh"
......@@ -28,38 +29,73 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the power(a, p)
>> a - the tensor
>> p - as it is
>> a - input tensor
>> b - output tensor
>> p - parameter
*/
void _Power(XTensor * a, DTYPE p)
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaPower(a, p);
_CudaPower(a, b, p);
return;
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * aData = (DTYPE*)a->data;
DTYPE * bData = (DTYPE*)b->data;
if (p == 0) {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)1.0;
bData[i] = (DTYPE)1.0;
}
else if (p == (DTYPE)0.5) {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)sqrt(d[i]);
bData[i] = (DTYPE)sqrt(aData[i]);
}
else if (p == (DTYPE)2.0) {
for (int i = 0; i < a->unitNum; i++)
d[i] = d[i] * d[i];
bData[i] = aData[i] * aData[i];
}
else {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)pow(d[i], p);
bData[i] = (DTYPE)pow(aData[i], p);
}
}
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void _PowerMe(XTensor * a, DTYPE p)
{
_Power(a, a, p);
}
/*
get the power(a, p) (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> p - parameter
<< return - the power value of the input tensor
*/
XTensor Power(const XTensor & a, DTYPE p)
{
XTensor b(&a);
b.SetTMP();
/* call _Power function */
_Power(&a, &b, p);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
return b;
}
} // namespace nts(NiuTrans.Tensor)
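Power is the first refactored unary op here that takes a scalar argument, which is why its functional form also records p on the link (XLink::AddParamToHead above) for any later backward pass. A usage sketch of the special-cased exponents (InitTensor2D is an assumed init helper; sizes illustrative):
XTensor x;
InitTensor2D(&x, 2, 2);
XTensor ones = Power(x, 0.0F);   /* fast path: every entry becomes 1 */
XTensor root = Power(x, 0.5F);   /* fast path: sqrt per entry */
XTensor sq   = Power(x, 2.0F);   /* fast path: square per entry */
XTensor gen  = Power(x, 3.7F);   /* generic pow() fallback */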
......@@ -21,6 +21,7 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../movement/CopyValues.cuh"
#include "Power.h"
#include "Power.cuh"
......@@ -30,74 +31,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set each entry to its square root (CUDA Kernel)
>> d - data array
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(DTYPE * d, int size)
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = sqrt(d[i]);
b[i] = sqrt(a[i]);
}
/*
set each entry to its square root (CUDA Kernel)
>> d - data array
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(__half * d, int size)
void KernelSqrtV2(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
d[i] = hsqrt(d[i]);
b[i] = hsqrt(a[i]);
#else
if (i < size)
d[i] = __float2half(sqrt(__half2float(d[i])));
b[i] = __float2half(sqrt(__half2float(a[i])));
#endif
}
/*
get power(d[i], p)
>> d - data array
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(DTYPE * d, DTYPE p, int size)
void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = pow(d[i], p);
b[i] = pow(a[i], p);
}
/*
get power(d[i], p)
>> d - data array
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(__half * d, __half p, int size)
void KernelPower(__half * a, __half * b, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = __float2half(pow(__half2float(d[i]), __half2float(p)));
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
#endif
}
/* get the power of the entries */
extern "C"
void _CudaPower(XTensor * a, DTYPE p)
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
int gridSize[3];
int blockSize[3];
......@@ -111,15 +118,18 @@ void _CudaPower(XTensor * a, DTYPE p)
if (a->dataType == DEFAULT_DTYPE) {
if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (p == (DTYPE)1.0) {
_CudaCopyValues(a, b);
}
else if (p != (DTYPE)1.0) {
KernelPower << <blocks, threads >> >((DTYPE*)a->data, p, a->unitNum);
KernelPower << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, p, a->unitNum);
}
}
else if (a->dataType == X_FLOAT16) {
if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(DTYPE * d, int size);
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(__half * d, int size);
void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */
extern "C"
void _CudaPower(XTensor * a, DTYPE p);
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */
extern "C"
void _Power(XTensor * a, DTYPE p);
void _Power(const XTensor * a, XTensor * b, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void _PowerMe(XTensor * a, DTYPE p);
/*
get the power(x, y) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -110,8 +110,7 @@ make a new tensor to keep the result and return it
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
CheckNTErrors(&s, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too larget dimension specified!");
CheckNTErrors(dim >= 0 && dim < s.order, "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
......@@ -123,16 +122,13 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
dimSize[i] = s.dimSize[i];
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
......@@ -140,7 +136,10 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
......
......@@ -101,32 +101,31 @@ make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
return output;
}
......
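A worked example of the loop fix (the same correction recurs in ReduceMean, ReduceSum, ReduceSumSquared and ReduceVariance below): reducing dim = 1 of a (2, 3, 4) input must give (2, 4):
/* sketch: order = input.order - 1 = 2
   i = 0:  i < dim   →  dimSize[0] = input.dimSize[0] = 2
   i = 1:  i >= dim  →  dimSize[1] = input.dimSize[2] = 4
   the old loop ran i up to input.order, never set dimSize[dim],
   and wrote one slot past the end of dimSize */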
......@@ -58,20 +58,19 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
XTensor ReduceMean(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceMean function */
......@@ -87,4 +86,4 @@ XTensor ReduceMean(const XTensor &input, int dim)
return output;
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -214,20 +214,19 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSum function */
......@@ -237,6 +236,53 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
return output;
}
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify whether exp() is performed
<< return - the sum along a dimension of the tensor
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
......
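A usage sketch contrasting the two ReduceSum overloads after this hunk (tensors illustrative; the new overload passes NULL for the shift internally):
XTensor s1 = ReduceSum(input, 1, shift);         /* \sum_i (a_i - shift)  */
XTensor s2 = ReduceSum(input, 1);                /* \sum_i a_i, no shift  */
XTensor s3 = ReduceSum(input, 1, 2.0F, true);    /* \sum_i exp((a_i)^2)   */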
......@@ -43,7 +43,16 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -54,25 +54,24 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
*/
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
......@@ -83,4 +82,4 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
return output;
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XName.h"
#include "../math/ScaleAndShift.h"
#include "ReduceSum.h"
#include "ReduceVariance.h"
......@@ -56,24 +57,27 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
......
......@@ -36,7 +36,7 @@ concatenate a list of tensors along a given dimension
*/
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
{
CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0;
int dimRDI = big->order - dim - 1;
......
......@@ -36,7 +36,7 @@ set target data block index for the data movement in merge
>> mem - the memory pool
*/
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
{
if (mem != NULL && mem->devID >= 0) {
#ifdef USE_CUDA
......
......@@ -40,7 +40,7 @@ set target data block index for the data movement in split (device code)
*/
__global__
void KernelMakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
int splitSizeInGrid, int gridSize, int gridNum)
{
/* block index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -71,8 +71,8 @@ set target data block index for the data movement in split
*/
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
{
int cudaGrids[3];
int cudaBlocks[3];
......
......@@ -30,9 +30,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
#endif // USE_CUDA
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */
extern "C"
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -161,8 +161,7 @@ e.g., (N/3, M, 3) -> (N, M)
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
{
CheckNTErrors(&s != NULL, "Invalid tensors!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
CheckNTErrors(leadingDim < whereToMerge, "Invalid leading dimension!");
if (leadingDim < 0)
leadingDim = 0;
......@@ -180,8 +179,8 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
}
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
/* call _Merge function */
......@@ -314,6 +313,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
/*
merge small tensors into a big tensor (return a XTensor structure)
make a new tensor to keep the result and return it
>> smalls - the list of the small tensors
>> whereToMerge - the merging operation is along with which dimension
......@@ -331,15 +331,13 @@ XTensor Merge(const XList &smalls, int whereToMerge)
dimSize[i] = tensor->dimSize[whereToMerge] * smalls.count;
}
XTensor big = NewTensor(order, dimSize,
tensor->dataType, tensor->denseRatio,
tensor->devID, tensor->mem);
big.SetZeroAll();
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
/* tensor connections */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
......@@ -370,10 +368,8 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
dimSize[i] = smallA.dimSize[whereToMerge] * 2;
}
XTensor big = NewTensor(order, dimSize,
smallA.dataType, smallA.denseRatio,
smallA.devID, smallA.mem);
big.SetZeroAll();
float dr = (!smallA.isSparse) ? 1.0F : smallA.denseRatio;
XTensor big(order, dimSize, smallA.dataType, dr, smallA.devID, smallA.mem);
big.SetTMP();
XList smalls(2);
......
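A usage sketch for the user-level Merge introduced above (shapes and data are illustrative; the call mirrors the Merge tests further down):

int dimSize[3] = {2, 2, 3};
XTensor * s = NewTensor(3, dimSize);
s->SetData(sData, 12);                 /* sData assumed prepared elsewhere */
XTensor t = Merge(*s, 1, 0);           /* merge along dim 1, leading dim 0 */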
......@@ -148,21 +148,21 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
int order = s.order + 1;
int * dimSize = new int[order];
dimSize[0] = splitNum;
for (int i = 0; i < s.order; i++) {
if (i == whereToSplit)
dimSize[i + 1] = s.dimSize[i] / splitNum;
dimSize[i+1] = s.dimSize[i] / splitNum;
else
dimSize[i + 1] = s.dimSize[i];
dimSize[i+1] = s.dimSize[i];
}
dimSize[0] = splitNum;
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
......@@ -175,7 +175,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
}
/*
split a big tensor into small tensors.
split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
......@@ -281,41 +281,19 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
}
/*
split a big tensor into small tensors (return a XList structure).
make a new list to keep the result and return it.
split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
NOTE that all the "small" tensors have already been placed in the list in advance.
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
<< return - a list of small tensors by splitting a big tensor
*/
XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
{
CheckNTErrors(&big, "Invalid tensors!");
XList smalls = XList(splitNum);
int order = big.order;
int * dimSize = new int[order];
for (int i = 0; i < big.order; i++) {
if (i != whereToSplit)
dimSize[i] = big.dimSize[i];
else
dimSize[i] = big.dimSize[i] / splitNum;
}
for (int i = 0; i < splitNum; i++) {
XTensor tensor = NewTensor(order, dimSize,
big.dataType, big.denseRatio,
big.devID, big.mem);
tensor.SetZeroAll();
tensor.SetTMP();
smalls.Add(&tensor);
}
/* call _Split function */
_Split(&big, &smalls, whereToSplit, splitNum);
/* tensor connections */
for(int i = 0; i < smalls.count; i++){
XTensor * s = (XTensor*)smalls.Get(i);
......@@ -326,11 +304,6 @@ XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
block, rather than the total number of splits */
XLink::AddParamToHeadInt(s, i);
}
/* destroy variables */
delete[] dimSize;
return smalls;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -46,7 +46,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
split a big tensor into small tensors (the list of output tensors is provided)
NOTE that all the "small" tensors have already been placed in the list in advance
*/
XList SplitList(const XTensor &big, int whereToSplit, int splitNum);
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum);
} // namespace nts(NiuTrans.Tensor)
......
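With SplitList gone, the caller now owns the output tensors. A usage sketch of the replacement (illustrative; s0 and s1 are small tensors pre-allocated with the split shape, as the NOTE above requires):

XList smalls(2);
smalls.Add(s0);                        /* pre-allocated small tensor */
smalls.Add(s1);
Split(big, smalls, 0, 2);              /* split dimension 0 into 2 pieces */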
......@@ -108,8 +108,6 @@ make a new tensor to keep the result and return it
*/
XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
{
CheckNTErrors(&a, "Empty input tensors!");
int order = a.order + 1;
int * dimSize = new int[order];
......@@ -122,8 +120,8 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
dimSize[i] = a.dimSize[i - 1];
}
XTensor b = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
b.SetZeroAll();
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
b.SetTMP();
/* call _Unsqueeze function */
......
......@@ -29,13 +29,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sort the tensor along a given dimension
>> a - the tensor
>> a - input tensor
>> b - output tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void _Sort(XTensor * a, XTensor * index, int dim)
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
......@@ -46,7 +47,7 @@ void _Sort(XTensor * a, XTensor * index, int dim)
if (a->devID >= 0) {
#ifdef USE_CUDA
_CudaSortBig(a, a, index, index, dim);
_CudaSortBig(a, b, index, index, dim);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......@@ -64,12 +65,13 @@ void _Sort(XTensor * a, XTensor * index, int dim)
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) {
void * data = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
/* we sort the data array along "dim" */
if (a->dataType == X_FLOAT)
XQSort(data, indexData, strideNum, a->unitSize, stride, CompXFloat);
XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
else {
ShowNTErrors("TODO!");
}
......@@ -78,4 +80,40 @@ void _Sort(XTensor * a, XTensor * index, int dim)
}
}
/*
sort the tensor along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
>> a - input tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void _SortMe(XTensor * a, XTensor * index, int dim)
{
_Sort(a, a, index, dim);
}
/*
sort the tensor along a given dimension (user-level interface)
keep the result in the output tensor b and the sorting order in the index tensor
>> a - input tensor
>> b - output tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim)
{
/* call _Sort function */
_Sort(&a, &b, &index, dim);
/* tensor connections */
XList list(2);
list.Add(&b);
list.Add(&index);
XLink::MakeLink(&a, &list, SORT_SORT);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, dim);
}
} // namespace nts(NiuTrans.Tensor)
......@@ -39,7 +39,7 @@ bitonic sort (for each row in a matrix)
>> n - row number of the matrix
*/
template<class T> __global__
void KernelBitonicSort2D(void * data, int j, int k, int m, int n)
void KernelBitonicSort2D(void * data, int j, int k, int m, int n)
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -74,7 +74,7 @@ bitonic sort (for each row in a matrix) with index
>> n - row number of the matrix
*/
template<class T> __global__
void KernelBitonicSort2D(void * data, int * index, int j, int k, int m, int n)
void KernelBitonicSort2D(void * data, int * index, int j, int k, int m, int n)
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
......
......@@ -27,8 +27,20 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sort the data along a given dimension */
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim);
/*
sort the data along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SortMe(XTensor * a, XTensor * index, int dim);
/*
sort the data along a given dimension (user-level interface)
keep the result in the output tensor b and the sorting order in the index tensor
*/
extern "C"
void _Sort(XTensor * a, XTensor * index, int dim);
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
} // namespace nts(NiuTrans.Tensor)
......
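Putting the new sorting interface together, a usage sketch (sizes illustrative; _Sort checks that index has the same order as a and is X_INT; the NewTensor overload with an explicit data type and devID -1 for CPU is an assumption):

int dimSize[2] = {2, 4};
XTensor * a = NewTensor(2, dimSize);
XTensor * b = NewTensor(2, dimSize);
XTensor * index = NewTensor(2, dimSize, X_INT, 1.0F, -1);
a->SetData(aData, 8);                  /* aData assumed prepared */
Sort(*a, *b, *index, 1);               /* b gets the values, index the order */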
......@@ -105,4 +105,29 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
}
}
}
/*
get the top-k items along a given dimension
>> a - input tensor
>> b - output tensor (top-k result)
>> index - index of the top-k items
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
*/
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k)
{
_TopK(&a, &b, &index, dim, k);
/* tensor connection */
XList list(2);
list.Add(&b);
list.Add(&index);
XLink::MakeLink(&a, &list, SORT_TOPK);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
}
} // namespace nts(NiuTrans.Tensor)
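A usage sketch for the new user-level TopK (illustrative; b and index are sized k along the search dimension; the X_INT index tensor and devID -1 for CPU are assumptions about the available NewTensor overloads):

int sDim[2] = {2, 4};
int tDim[2] = {2, 2};                  /* k = 2 along dimension 1 */
XTensor * a = NewTensor(2, sDim);
XTensor * b = NewTensor(2, tDim);
XTensor * index = NewTensor(2, tDim, X_INT, 1.0F, -1);
a->SetData(aData, 8);                  /* aData assumed prepared */
TopK(*a, *b, *index, 1, 2);            /* top-2 items along dimension 1 */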
......@@ -30,6 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
extern "C"
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/* get the top-k items along a given dimension */
extern "C"
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k);
} // namespace nts(NiuTrans.Tensor)
#endif // __TOPK_H__
\ No newline at end of file
......@@ -60,7 +60,9 @@ void _HardTanH(const XTensor * x, XTensor * y)
}
/*
hard tanh function (return a structure)
hard tanh function (return a XTensor structure)
make a new tensor to keep the result and return it
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
......@@ -72,8 +74,10 @@ XTensor HardTanH(const XTensor &x)
XTensor y(&x);
y.SetTMP();
/* call _HardTanH function */
_HardTanH(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
return y;
......@@ -116,7 +120,7 @@ void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
......
......@@ -137,7 +137,7 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -37,7 +37,7 @@ y = 1 if x > 1
*/
void _HardTanH(const XTensor * x, XTensor * y);
/* hard tanh function (return a structure) */
/* hard tanh function (return a XTensor structure) */
XTensor HardTanH(const XTensor &x);
/* de/dx */
......
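A worked example of the clipping defined above, using the new user-level call (values illustrative):

float xData[4] = {-2.0F, -0.5F, 0.3F, 1.7F};
/* expected y:     -1.0F, -0.5F, 0.3F, 1.0F   (clipped to [-1, 1]) */
int dimSize[1] = {4};
XTensor * x = NewTensor(1, dimSize);
x->SetData(xData, 4);
XTensor y = HardTanH(*x);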
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#include "../XName.h"
#include "Identity.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
......@@ -36,6 +37,26 @@ void _Identity(const XTensor * x, XTensor * y)
}
/*
identity function y = x (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
*/
XTensor Identity(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
/* call _Identity function */
_Identity(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
return y;
}
/*
backward computation for identity function y = x
dE/dx = dE/dy * dy/dx = dE/dy
......@@ -58,7 +79,7 @@ void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
if(dedy->data != dedx->data)
_CopyValues(dedy, dedx);
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* identity function y = x */
extern "C"
void _Identity(const XTensor * x, XTensor * y);
/* identity function y = x (return a XTensor structure) */
XTensor Identity(const XTensor &x);
/* de/dx */
extern "C"
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
......@@ -161,19 +161,23 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
ShowNTErrors("TODO!");
}
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a structure)
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input vector
>> leadDim - leading dimension (along which we perform reduction)
<< return - result
<< return - y
*/
XTensor LogSoftmax(const XTensor &x, int leadDim)
{
XTensor y(&x);
y.SetTMP();
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, leadDim);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, leadDim);
......
......@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) */
void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a structure) */
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim);
/* de/dx */
......
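For reference, the numerically stable way to evaluate the formula above is to subtract the per-row maximum before exponentiating; whether _LogSoftmax does this internally is not visible in these hunks, though the ReduceMax include in Softmax.cpp below points the same way:

y_i = (x_i - m) - log \sum_{j} e^{x_j - m},   where m = max_{j} x_j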
......@@ -42,7 +42,7 @@ compute the loss
>> oBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg)
{
DTYPE error = 0.0F;
......@@ -66,7 +66,7 @@ DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
blockNum = output->unitNum / blockSize;
if(isLogOutput)
return LossComputeForLogScale(gold, output, LFName, leadDim, gBeg, gLen, oBeg);
return _LossComputeForLogScale(gold, output, LFName, leadDim, gBeg, gLen, oBeg);
DTYPE * gp = (DTYPE*)gold->data;
DTYPE * op = (DTYPE*)output->data;
......@@ -180,7 +180,7 @@ DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
}
else {
#ifdef USE_CUDA
error = CudaLossCompute(gold, output, LFName, isLogOutput, leadDim, gBeg, gLen, oBeg);
error = _CudaLossCompute(gold, output, LFName, isLogOutput, leadDim, gBeg, gLen, oBeg);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......@@ -201,7 +201,7 @@ the log version of loss computation
>> oBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE LossComputeForLogScale(XTensor * gold, XTensor * output,
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg)
{
......@@ -343,7 +343,7 @@ with respect to gold standard, and y is the model output
>> LFName - name of loss function
<< return dE/dy
*/
DTYPE LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
{
/*
squared error
......@@ -380,7 +380,7 @@ with respect to gold standard, and y is the model output
>> tLen - segment length from tBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
*/
void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim, int tBeg, int tLen, int yBeg)
{
......@@ -496,7 +496,7 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
}
else {
#ifdef USE_CUDA
CudaLossBackward(dedy, t, y, LFName, leadDim, tBeg, tLen, yBeg);
_CudaLossBackward(dedy, t, y, LFName, leadDim, tBeg, tLen, yBeg);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......
......@@ -51,7 +51,7 @@ compute the loss
>> yBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int yBeg)
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
......@@ -65,7 +65,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
if(isLogOutput)
return LossComputeForLogScale(gold, y, LFName, leadDim, gBeg, gLen, yBeg);
return _LossComputeForLogScale(gold, y, LFName, leadDim, gBeg, gLen, yBeg);
DTYPE error = 0.0F;
......@@ -77,7 +77,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
if(LFName == SQUAREDERROR){
XTensor * diff = NewTensor(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
_Sum(gold, y, diff, -1.0F);
_Power(diff, 2.0F);
_PowerMe(diff, 2.0F);
_ScaleAndShiftMe(diff, 0.5F, 0.0F);
int reduceTimes = diff->order;
......@@ -110,9 +110,9 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
if(LFName == CROSSENTROPY){
XTensor * diff = NewTensor(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
_CopyValues(y, diff);
_Log(diff);
_LogMe(diff);
_Multiply(gold, diff, diff);
_Negate(diff);
_NegateMe(diff);
int reduceTimes = diff->order;
for (int i = 0; i < reduceTimes; i++) {
......@@ -148,7 +148,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
_CopyValues(y, yOnehot);
_Multiply(gold, y, yOnehot);
_Sum(gold, yOnehot, diff, -1.0F);
_Power(diff, 2.0F);
_PowerMe(diff, 2.0F);
_ScaleAndShiftMe(diff, 0.5F, 0.0F);
int reduceTimes = diff->order;
......@@ -190,7 +190,7 @@ the log version of loss computation
>> yBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE CudaLossComputeForLogScale(XTensor * gold, XTensor * y,
DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int yBeg)
{
......@@ -209,9 +209,9 @@ with respect to gold standard, and y is the model output
>> LFName - name of loss function
<< return dE/dy
*/
DTYPE CudaLossBackward(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
DTYPE _CudaLossBackward(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
{
return LossBackwardPoint(t, y, LFName);
return _LossBackwardPoint(t, y, LFName);
// TODO: call cuda kernels for computing the errors
}
......@@ -328,7 +328,7 @@ with respect to gold standard, and y is the model output
>> tLen - segment length from oBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
*/
void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim, int tBeg, int tLen, int yBeg)
{
......
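The op chains above compose the textbook losses directly: _Sum(gold, y, diff, -1.0F) followed by _PowerMe(diff, 2.0F) and _ScaleAndShiftMe(diff, 0.5F, 0.0F) builds the squared error, while _LogMe, _Multiply and _NegateMe build the cross entropy. With t the gold standard and y the model output:

squared error:  E = 1/2 * \sum_{i} (t_i - y_i)^2
cross entropy:  E = -\sum_{i} t_i * log(y_i)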
......@@ -31,21 +31,21 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compute the loss (cuda version) */
extern "C"
DTYPE CudaLossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _CudaLossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss in log scale (cuda version) */
extern "C"
DTYPE CudaLossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element (cuda version) */
extern "C"
DTYPE CudaLossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
DTYPE _CudaLossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors (cuda version) */
extern "C"
void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
......
......@@ -48,21 +48,21 @@ loss function to measure the "number" of errors
/* compute the loss */
extern "C"
DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */
extern "C"
DTYPE LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element */
extern "C"
DTYPE LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors */
extern "C"
void LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
......
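A usage sketch of the renamed entry point, mirroring the calls in the updated tests below (gold and output assumed allocated with matching shapes):

DTYPE error = _LossCompute(gold, output, SQUAREDERROR,
                           false,                      /* isLogOutput */
                           0,                          /* leadDim */
                           0, output->dimSize[0], 0);  /* gBeg, gLen, oBeg */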
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XName.h"
#include "Rectify.h"
#include "Rectify.cuh"
......@@ -55,6 +56,28 @@ void _Rectify(const XTensor * x, XTensor * y)
}
/*
rectify function y = max(0, x) (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
*/
XTensor Rectify(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
/* call _Rectify function */
_Rectify(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
return y;
}
/*
backward computation
dE/dx = dE/dy * dy/dx
......@@ -94,7 +117,7 @@ void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
......
......@@ -134,7 +134,7 @@ void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
/* calculate dE/dy */
if(lossName != NOLOSS)
CudaLossBackward(dedy, gold, y, lossName);
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* rectify function y = max(0, x) */
extern "C"
void _Rectify(const XTensor * x, XTensor * y);
/* rectify function y = max(0, x) (return a XTensor structure) */
XTensor Rectify(const XTensor &x);
/* de/dx */
extern "C"
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
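A worked example of y = max(0, x) with the new user-level call (values illustrative):

float xData[4] = {-1.5F, 0.0F, 0.5F, 2.3F};
/* expected y:      0.0F, 0.0F, 0.5F, 2.3F */
int dimSize[1] = {4};
XTensor * x = NewTensor(1, dimSize);
x->SetData(xData, 4);
XTensor y = Rectify(*x);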
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include "../XName.h"
#include <math.h>
#include "Sigmoid.h"
#include "Sigmoid.cuh"
......@@ -53,6 +54,27 @@ void _Sigmoid(const XTensor * x, XTensor * y)
}
/*
sigmoid function y = 1/(1+exp(-x)) (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
*/
XTensor Sigmoid(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
/* call _Sigmoid function */
_Sigmoid(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
return y;
}
/*
backward computation
dE/dx = dE/dy * dy/dx
......@@ -86,7 +108,7 @@ void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
......
......@@ -129,7 +129,7 @@ void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sigmoid function y = 1/(1+exp(-x)) */
extern "C"
void _Sigmoid(const XTensor * x, XTensor * y);
/* sigmoid function y = 1/(1+exp(-x)) (return a XTensor structure) */
XTensor Sigmoid(const XTensor &x);
/* de/dx */
extern "C"
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
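For the backward declaration above, the standard identity for y = 1/(1+exp(-x)) gives the gradient without re-evaluating the exponential; the kernel body is not shown here, but this is the usual computation:

dy/dx = y * (1 - y)
dE/dx = dE/dy * y * (1 - y)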
......@@ -22,6 +22,7 @@
#include <math.h>
#include "Softmax.h"
#include "Softmax.cuh"
#include "../XName.h"
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
......@@ -130,6 +131,28 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
}
/*
softmax y = e^x / \sum_{i} e^{x_i} (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input vector
>> leadDim - leading dimension (along which we perform reduction)
<< return - y
*/
XTensor Softmax(const XTensor &x, int leadDim)
{
XTensor y(&x);
y.SetTMP();
/* call _Softmax function */
_Softmax(&x, &y, leadDim);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
return y;
}
/*
backward computation for dense tensors
dE/dx = dE/dy * dy/dx
......
......@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void _CudaSotmax(const XTensor * input, XTensor * output, int leadDim);
void _CudaSoftmax(const XTensor * input, XTensor * output, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* softmax y = e^x / \sum_{i} e^{x_i} */
extern "C"
void _Softmax(const XTensor * x, XTensor * y, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (return a XTensor structure) */
XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
extern "C"
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
......
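A usage sketch of the user-level Softmax (mirrors the LogSoftmax test calls below; leadDim selects the dimension that is normalized):

int dimSize[2] = {2, 3};
XTensor * x = NewTensor(2, dimSize);
x->SetData(xData, 6);                  /* xData assumed prepared */
XTensor y = Softmax(*x, 1);            /* each row of y now sums to 1 */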
......@@ -51,15 +51,21 @@ bool TestAbsolute1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Absolute function */
_Absolute(a);
_Absolute(a, b);
_AbsoluteMe(aMe);
bUser = Absolute(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -67,25 +73,37 @@ bool TestAbsolute1()
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Absolute function */
_Absolute(aGPU);
_Absolute(aGPU, bGPU);
_AbsoluteMe(aMeGPU);
bUserGPU = Absolute(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
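The Absolute test above is the template the rest of the updated tests follow: each op is exercised in its three forms and all results are checked against the same answer. Schematically, with _Op, _OpMe and Op as placeholders rather than library names:

_Op(a, b);                 /* explicit output: read a, write b  */
_OpMe(aMe);                /* in-place: overwrite aMe           */
XTensor bUser = Op(*a);    /* user-level: returns a TMP tensor  */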
......@@ -76,6 +76,7 @@ bool TestConcatenate1()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -88,9 +89,10 @@ bool TestConcatenate1()
/* call Concatenate function */
_Concatenate(sList, t, 1);
tUser = Concatenate(*sList, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -100,6 +102,7 @@ bool TestConcatenate1()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -115,9 +118,10 @@ bool TestConcatenate1()
/* call Concatenate function */
_Concatenate(sList, tGPU, 1);
tUserGPU = Concatenate(*sList, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
......@@ -201,6 +205,7 @@ bool TestConcatenate2()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -213,9 +218,10 @@ bool TestConcatenate2()
/* call Concatenate function */
_Concatenate(sList, t, 0);
tUser = Concatenate(*sList, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -225,6 +231,7 @@ bool TestConcatenate2()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -240,9 +247,10 @@ bool TestConcatenate2()
/* call Concatenate function */
_Concatenate(sList, tGPU, 0);
tUserGPU = Concatenate(*sList, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
......@@ -324,6 +332,7 @@ bool TestConcatenate3()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -336,9 +345,10 @@ bool TestConcatenate3()
/* call Concatenate function */
_Concatenate(sList, t, 1);
tUser = Concatenate(*sList, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -348,6 +358,7 @@ bool TestConcatenate3()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -363,9 +374,10 @@ bool TestConcatenate3()
/* call Concatenate function */
_Concatenate(sList, tGPU, 1);
tUserGPU = Concatenate(*sList, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
......@@ -444,6 +456,7 @@ bool TestConcatenate4()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -452,9 +465,10 @@ bool TestConcatenate4()
/* call Concatenate function */
_Concatenate(s1, s2, t, 1);
tUser = Concatenate(*s1, *s2, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -464,6 +478,7 @@ bool TestConcatenate4()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -472,9 +487,10 @@ bool TestConcatenate4()
/* call Concatenate function */
_Concatenate(sGPU1, sGPU2, tGPU, 1);
tUserGPU = Concatenate(*sGPU1, *sGPU2, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......
......@@ -53,6 +53,7 @@ bool TestHardTanH1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -60,9 +61,10 @@ bool TestHardTanH1()
/* call hardtanh function */
_HardTanH(x, y);
yUser = HardTanH(*x);
/* check results */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -71,6 +73,7 @@ bool TestHardTanH1()
/* create tensor */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* Initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -78,9 +81,10 @@ bool TestHardTanH1()
/* call hardtanh function */
_HardTanH(xGPU, yGPU);
yUserGPU = HardTanH(*xGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
......@@ -51,6 +51,7 @@ bool TestIdentity1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -58,9 +59,10 @@ bool TestIdentity1()
/* call Identity function */
_Identity(x, y);
yUser = Identity(*x);
/* check result */
cpuTest = y->CheckData(answer, unitNum);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -69,6 +71,7 @@ bool TestIdentity1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -76,9 +79,10 @@ bool TestIdentity1()
/* call Identity function */
_Identity(xGPU, yGPU);
yUserGPU = Identity(*xGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
......@@ -51,15 +51,21 @@ bool TestLog1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Log function */
_Log(a);
_Log(a, b);
_LogMe(aMe);
bUser = Log(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -67,25 +73,37 @@ bool TestLog1()
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Log function */
_Log(aGPU);
_Log(aGPU, bGPU);
_LogMe(aMeGPU);
bUserGPU = Log(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
......@@ -51,6 +51,7 @@ bool TestLogSoftmax1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -58,9 +59,10 @@ bool TestLogSoftmax1()
/* call LogSoftmax function */
_LogSoftmax(x, y, 1);
yUser = LogSoftmax(*x, 1);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -69,6 +71,7 @@ bool TestLogSoftmax1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -76,9 +79,10 @@ bool TestLogSoftmax1()
/* call LogSoftmax function */
_LogSoftmax(xGPU, yGPU, 1);
yUserGPU = LogSoftmax(*xGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
......@@ -46,6 +46,7 @@ bool TestLoss1()
bool cpuTest = true;
DTYPE answer = 5.0F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
......@@ -57,8 +58,8 @@ bool TestLoss1()
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* call LossCompute function */
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
......@@ -78,7 +79,7 @@ bool TestLoss1()
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
......@@ -123,6 +124,7 @@ bool TestLoss2()
bool cpuTest = true;
DTYPE answer = 0.0F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
......@@ -134,8 +136,8 @@ bool TestLoss2()
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* call LossCompute function */
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
......@@ -155,7 +157,7 @@ bool TestLoss2()
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
......@@ -210,6 +212,7 @@ bool TestLoss3()
bool cpuTest = true;
DTYPE answer = 0.25F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
......@@ -219,8 +222,8 @@ bool TestLoss3()
output->SetData(outputData, unitNum);
gold->SetData(goldData, unitNum);
DTYPE error;
error = LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* call LossCompute function */
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
......@@ -238,7 +241,7 @@ bool TestLoss3()
goldGPU->SetData(goldData, unitNum);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
......
......@@ -75,6 +75,7 @@ bool TestMatrixMulBatched1()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -83,9 +84,10 @@ bool TestMatrixMulBatched1()
/* call MatrixMulBatched function */
_MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
tUser = MatrixMulBatched(*s1, X_NOTRANS, *s2, X_NOTRANS);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -95,6 +97,7 @@ bool TestMatrixMulBatched1()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -103,9 +106,10 @@ bool TestMatrixMulBatched1()
/* call MatrixMulBatched function */
_MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMulBatched(*sGPU1, X_NOTRANS, *sGPU2, X_NOTRANS);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......@@ -193,6 +197,7 @@ bool TestMatrixMulBatched2()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -201,10 +206,11 @@ bool TestMatrixMulBatched2()
/* call MatrixMulBatched function */
_MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
tUser = MatrixMulBatched(*s1, X_NOTRANS, *s2, X_NOTRANS);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
......@@ -213,6 +219,7 @@ bool TestMatrixMulBatched2()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -221,9 +228,10 @@ bool TestMatrixMulBatched2()
/* call MatrixMulBatched function */
_MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMulBatched(*sGPU1, X_NOTRANS, *sGPU2, X_NOTRANS);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......
......@@ -60,16 +60,18 @@ bool TestMerge1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(s, t, 1, 0);
tUser = Merge(*s, 1, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -78,16 +80,18 @@ bool TestMerge1()
/* create tensor */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(sGPU, tGPU, 1, 0);
tUserGPU = Merge(*sGPU, 1, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -166,18 +170,23 @@ bool TestMerge2()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
t1->SetZeroAll();
t2->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(s, t1, 1, 0);
_Merge(s, t2, 2, 0);
tUser1 = Merge(*s, 1, 0);
tUser2 = Merge(*s, 2, 0);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -187,18 +196,23 @@ bool TestMerge2()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(sGPU, tGPU1, 1, 0);
_Merge(sGPU, tGPU2, 2, 0);
tUserGPU1 = Merge(*sGPU, 1, 0);
tUserGPU2 = Merge(*sGPU, 2, 0);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......@@ -271,6 +285,7 @@ bool TestMerge3()
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum);
......@@ -281,11 +296,12 @@ bool TestMerge3()
smallList->Add(s1);
smallList->Add(s2);
/* call merge function */
/* call Merge function */
_Merge(smallList, t, 0);
tUser = Merge(*smallList, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -298,6 +314,7 @@ bool TestMerge3()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize);
XTensor tUserGPU;
/* initialize variables */
sGPU1->SetData(sData1, sUnitNum);
......@@ -308,11 +325,12 @@ bool TestMerge3()
smallList->Add(sGPU1);
smallList->Add(sGPU2);
/* call merge function */
/* call Merge function */
_Merge(smallList, tGPU, 0);
tUserGPU = Merge(*smallList, 0);
/* check results */
cpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......@@ -383,6 +401,7 @@ bool TestMerge4()
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum);
......@@ -393,11 +412,12 @@ bool TestMerge4()
smallList->Add(s1);
smallList->Add(s2);
/* call merge function */
/* call Merge function */
_Merge(smallList, t, 1);
tUser = Merge(*smallList, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -410,6 +430,7 @@ bool TestMerge4()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize);
XTensor tUserGPU;
/* initialize variables */
sGPU1->SetData(sData1, sUnitNum);
......@@ -420,11 +441,12 @@ bool TestMerge4()
smallList->Add(sGPU1);
smallList->Add(sGPU2);
/* call merge function */
/* call Merge function */
_Merge(smallList, tGPU, 1);
tUserGPU = Merge(*smallList, 1);
/* check results */
cpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......
......@@ -48,15 +48,21 @@ bool TestNegate1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(a);
_Negate(a, b);
_NegateMe(aMe);
bUser = Negate(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -64,25 +70,37 @@ bool TestNegate1()
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(aGPU);
_Negate(aGPU, bGPU);
_NegateMe(aMeGPU);
bUserGPU = Negate(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......@@ -114,41 +132,59 @@ bool TestNegate2()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(a);
_Negate(a, b);
_NegateMe(aMe);
bUser = Negate(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(aGPU);
_Negate(aGPU, bGPU);
_NegateMe(aMeGPU);
bUserGPU = Negate(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
......@@ -52,41 +52,59 @@ bool TestPower1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Power function */
_Power(a, 2.0F);
_Power(a, b, 2.0F);
_PowerMe(aMe, 2.0F);
bUser = Power(*a, 2.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call power function */
_Power(aGPU, 2.0F);
_Power(aGPU, bGPU, 2.0F);
_PowerMe(aMeGPU, 2.0F);
bUserGPU = Power(*aGPU, 2.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......@@ -121,41 +139,59 @@ bool TestPower2()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Power function */
_Power(a, 1.0F);
_Power(a, b, 1.0F);
_PowerMe(aMe, 1.0F);
bUser = Power(*a, 1.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Power function */
_Power(aGPU, 1.0F);
_Power(aGPU, bGPU, 1.0F);
_PowerMe(aMeGPU, 1.0F);
bUserGPU = Power(*aGPU, 1.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......@@ -190,41 +226,59 @@ bool TestPower3()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Power function */
_Power(a, 0.0F);
_Power(a, b, 0.0F);
_PowerMe(aMe, 0.0F);
bUser = Power(*a, 0.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Power function */
_Power(aGPU, 0.0F);
_Power(aGPU, bGPU, 0.0F);
_PowerMe(aMeGPU, 0.0F);
bUserGPU = Power(*aGPU, 0.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
......@@ -50,6 +50,7 @@ bool TestRectify1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -57,9 +58,10 @@ bool TestRectify1()
/* call Rectify function */
_Rectify(x, y);
yUser = Rectify(*x);
/* check results */
cpuTest = y->CheckData(answer, unitNum);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -68,6 +70,7 @@ bool TestRectify1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -75,9 +78,10 @@ bool TestRectify1()
/* call Rectify function */
_Rectify(xGPU, yGPU);
yUserGPU = Rectify(*xGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
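Rectify is elementwise max(0, x); the change above adds the by-value form yUser = Rectify(*x) next to the pointer-based _Rectify(x, y). A short sketch with assumed input values:

/* sketch: Rectify clips negatives to zero, i.e. y_i = max(0, x_i);
   the data values are illustrative assumptions */
int order = 2;
int dimSize[2] = {1, 3};
float xData[3] = {-1.0F, 0.0F, 2.5F};
float answer[3] = {0.0F, 0.0F, 2.5F};

XTensor * x = NewTensor(order, dimSize);
XTensor y;
x->SetData(xData, 3);

y = Rectify(*x);
bool ok = y.CheckData(answer, 3, 1e-4F);

delete x;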
......@@ -71,6 +71,8 @@ bool TestReduceMax1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -80,9 +82,12 @@ bool TestReduceMax1()
/* call ReduceMax function */
_ReduceMax(s, t1, 0);
_ReduceMax(s, t2, 1);
tUser1 = ReduceMax(*s, 0);
tUser2 = ReduceMax(*s, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -92,6 +97,8 @@ bool TestReduceMax1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -101,9 +108,12 @@ bool TestReduceMax1()
/* call ReduceMax function */
_ReduceMax(sGPU, tGPU1, 0);
_ReduceMax(sGPU, tGPU2, 1);
tUserGPU1 = ReduceMax(*sGPU, 0);
tUserGPU2 = ReduceMax(*sGPU, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......
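ReduceMax(*s, dim) collapses one dimension of s by taking the maximum along it: reducing a 2 x 3 tensor along dimension 0 yields the three column maxima, along dimension 1 the two row maxima. A worked sketch with assumed values:

/* sketch: ReduceMax over each dimension of a 2 x 3 tensor;
   the data is an illustrative assumption */
int sDimSize[2] = {2, 3};
float sData[6] = {1.0F, 5.0F, 2.0F,
                  4.0F, 0.0F, 6.0F};
float answer0[3] = {4.0F, 5.0F, 6.0F};   /* column maxima (dim 0) */
float answer1[2] = {5.0F, 6.0F};         /* row maxima (dim 1)    */

XTensor * s = NewTensor(2, sDimSize);
XTensor t0, t1;
s->SetData(sData, 6);

t0 = ReduceMax(*s, 0);
t1 = ReduceMax(*s, 1);
bool ok = t0.CheckData(answer0, 3) && t1.CheckData(answer1, 2);

delete s;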
......@@ -66,6 +66,8 @@ bool TestReduceMean1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -75,9 +77,12 @@ bool TestReduceMean1()
/* call ReduceMean function */
_ReduceMean(s, t1, 0);
_ReduceMean(s, t2, 1);
tUser1 = ReduceMean(*s, 0);
tUser2 = ReduceMean(*s, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -87,6 +92,8 @@ bool TestReduceMean1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -96,9 +103,12 @@ bool TestReduceMean1()
/* call ReduceMean function */
_ReduceMean(sGPU, tGPU1, 0);
_ReduceMean(sGPU, tGPU2, 1);
tUserGPU1 = ReduceMean(*sGPU, 0);
tUserGPU2 = ReduceMean(*sGPU, 1);
/* check results */
cpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......
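ReduceMean is the same reduction divided by the length of the reduced dimension, i.e. t_j = (1/n) * sum_i s_ij when reducing dimension 0. A sketch with assumed values:

/* sketch: ReduceMean divides the sum by the reduced dimension's
   length; data values are illustrative assumptions */
int sDimSize[2] = {2, 2};
float sData[4] = {1.0F, 3.0F,
                  5.0F, 7.0F};
float answer0[2] = {3.0F, 5.0F};   /* (1+5)/2, (3+7)/2 */
float answer1[2] = {2.0F, 6.0F};   /* (1+3)/2, (5+7)/2 */

XTensor * s = NewTensor(2, sDimSize);
XTensor m0, m1;
s->SetData(sData, 4);

m0 = ReduceMean(*s, 0);
m1 = ReduceMean(*s, 1);
bool ok = m0.CheckData(answer0, 2) && m1.CheckData(answer1, 2);

delete s;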
......@@ -69,20 +69,29 @@ bool TestReduceSum1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * shift1 = NewTensor(tOrder1, tDimSize1);
XTensor * shift2 = NewTensor(tOrder2, tDimSize2);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
shift1->SetZeroAll();
shift2->SetZeroAll();
t1->SetZeroAll();
t2->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(s, t1, 0);
_ReduceSum(s, t2, 1);
tUser1 = ReduceSum(*s, 0, *shift1);
tUser2 = ReduceSum(*s, 1, *shift2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -90,26 +99,39 @@ bool TestReduceSum1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU1->SetZeroAll();
shiftGPU2->SetZeroAll();
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU1, 0);
_ReduceSum(sGPU, tGPU2, 1);
tUserGPU1 = ReduceSum(*sGPU, 0, *shiftGPU1);
tUserGPU2 = ReduceSum(*sGPU, 1, *shiftGPU2);
/* check results */
cpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
delete shift1;
delete shift2;
delete t1;
delete t2;
delete sGPU;
delete shiftGPU1;
delete shiftGPU2;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
......@@ -120,6 +142,8 @@ bool TestReduceSum1()
#else
/* destroy variables */
delete s;
delete shift1;
delete shift2;
delete t1;
delete t2;
delete[] sDimSize;
......
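The new user-level ReduceSum takes a shift tensor shaped like the output, which is why the test now allocates and zeroes shift1/shift2; with a zero shift it reduces to a plain sum. A sketch with a non-zero shift, assuming each output element is the sum of (input - shift) along the reduced dimension, with the shift broadcast over that dimension; the values are illustrative:

/* sketch: ReduceSum with a shift computes t_j = sum_i (s_ij - shift_j);
   shapes and values are illustrative assumptions */
int sDimSize[2] = {2, 2};
int tDimSize[1] = {2};
float sData[4] = {1.0F, 2.0F,
                  3.0F, 4.0F};
float shiftData[2] = {1.0F, 1.0F};
float answer[2] = {2.0F, 4.0F};   /* (1-1)+(3-1), (2-1)+(4-1) */

XTensor * s = NewTensor(2, sDimSize);
XTensor * shift = NewTensor(1, tDimSize);
XTensor t;
s->SetData(sData, 4);
shift->SetData(shiftData, 2);

t = ReduceSum(*s, 0, *shift);
bool ok = t.CheckData(answer, 2);

delete s;
delete shift;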
......@@ -70,6 +70,7 @@ bool TestReduceSumSquared1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * shift = NewTensor(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -78,9 +79,10 @@ bool TestReduceSumSquared1()
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 0, shift);
tUser = ReduceSumSquared(*s, 0, *shift);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -90,6 +92,7 @@ bool TestReduceSumSquared1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -98,9 +101,10 @@ bool TestReduceSumSquared1()
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 0, *shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -174,6 +178,7 @@ bool TestReduceSumSquared2()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * shift = NewTensor(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -182,9 +187,10 @@ bool TestReduceSumSquared2()
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 1, shift);
tUser = ReduceSumSquared(*s, 1, *shift);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -194,6 +200,7 @@ bool TestReduceSumSquared2()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -202,9 +209,10 @@ bool TestReduceSumSquared2()
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 1, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 1, *shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......
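ReduceSumSquared sums squared deviations from the shift tensor, t_j = sum_i (s_ij - shift_j)^2, the unnormalized building block of the variance below. A sketch with assumed values:

/* sketch: ReduceSumSquared computes t_j = sum_i (s_ij - shift_j)^2;
   values are illustrative assumptions */
int sDimSize[2] = {2, 2};
int shiftDimSize[1] = {2};
float sData[4] = {1.0F, 2.0F,
                  3.0F, 4.0F};
float shiftData[2] = {2.0F, 3.0F};
float answer[2] = {2.0F, 2.0F};   /* (1-2)^2+(3-2)^2, (2-3)^2+(4-3)^2 */

XTensor * s = NewTensor(2, sDimSize);
XTensor * shift = NewTensor(1, shiftDimSize);
XTensor t;
s->SetData(sData, 4);
shift->SetData(shiftData, 2);

t = ReduceSumSquared(*s, 0, *shift);
bool ok = t.CheckData(answer, 2);

delete s;
delete shift;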
......@@ -70,6 +70,7 @@ bool TestReduceVariance1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * mean = NewTensor(meanOrder, meanDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -78,9 +79,10 @@ bool TestReduceVariance1()
/* call ReduceVariance function */
_ReduceVariance(s, t, 0, mean);
tUser = ReduceVariance(*s, 0, *mean);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -90,6 +92,7 @@ bool TestReduceVariance1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * meanGPU = NewTensor(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -98,9 +101,10 @@ bool TestReduceVariance1()
/* call ReduceVariance function */
_ReduceVariance(sGPU, tGPU, 0, meanGPU);
tUserGPU = ReduceVariance(*sGPU, 0, *meanGPU);
/* check results */
gpuTest = t->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......
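ReduceVariance takes the mean tensor explicitly and averages the squared deviations: t_j = (1/n) * sum_i (s_ij - mean_j)^2, where n is the length of the reduced dimension. A sketch with assumed values:

/* sketch: ReduceVariance computes t_j = (1/n) * sum_i (s_ij - mean_j)^2;
   values are illustrative assumptions */
int sDimSize[2] = {2, 2};
int meanDimSize[1] = {2};
float sData[4] = {1.0F, 2.0F,
                  3.0F, 6.0F};
float meanData[2] = {2.0F, 4.0F};
float answer[2] = {1.0F, 4.0F};   /* ((1-2)^2+(3-2)^2)/2, ((2-4)^2+(6-4)^2)/2 */

XTensor * s = NewTensor(2, sDimSize);
XTensor * mean = NewTensor(1, meanDimSize);
XTensor t;
s->SetData(sData, 4);
mean->SetData(meanData, 2);

t = ReduceVariance(*s, 0, *mean);
bool ok = t.CheckData(answer, 2);

delete s;
delete mean;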
......@@ -48,6 +48,7 @@ bool TestSigmoid1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -55,9 +56,10 @@ bool TestSigmoid1()
/* call Sigmoid function */
_Sigmoid(x, y);
yUser = Sigmoid(*x);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -66,6 +68,7 @@ bool TestSigmoid1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -73,9 +76,10 @@ bool TestSigmoid1()
/* call Sigmoid function */
_Sigmoid(xGPU, yGPU);
yUserGPU = Sigmoid(*xGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
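Sigmoid is elementwise y = 1 / (1 + exp(-x)), so sigmoid(0) = 0.5 and sigmoid(1) ≈ 0.7311. A sketch of the by-value form added above, with assumed input values:

/* sketch: Sigmoid computes y_i = 1 / (1 + exp(-x_i));
   values are illustrative assumptions */
int order = 2;
int dimSize[2] = {1, 3};
float xData[3] = {0.0F, 1.0F, -1.0F};
float answer[3] = {0.5F, 0.731059F, 0.268941F};

XTensor * x = NewTensor(order, dimSize);
XTensor y;
x->SetData(xData, 3);

y = Sigmoid(*x);
bool ok = y.CheckData(answer, 3, 1e-4F);

delete x;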