Commit 0887fae1 by liyinqiao

Format correction.

parent 42f995ae
......@@ -28,6 +28,10 @@
#include "Concatenate.h"
#include "ConcatenateSolely.h"
#include "CopyBlocks.h"
#include "CopyBlocksInGrid.h"
#include "CopyBlocksOnSite.h"
#include "CopyData2D.h"
#include "CopyIndexed.h"
#include "CopyInGrid.h"
#include "CopyValues.h"
......@@ -53,6 +57,7 @@
#include "ReduceSumSquared.h"
#include "ReduceVariance.h"
#include "ScaleAndShift.h"
#include "Select.h"
#include "SetData.h"
#include "Sort.h"
#include "Split.h"
......
......@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
/*
concatenate two tensors along a given dimension
>> smallA - one tensor for concatenation
>> smallB - the other tensor for concatenation
>> big - the resulting tensor
>> dim - the dimension along which we perform the concatenation
*/
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
{
......
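A minimal sketch of how this two-tensor overload can defer to the list version, assuming the usual XList usage (XList::Add is an assumption here, not shown in the diff):

/* wrap the two inputs in a list and reuse the list-based Concatenate */
XList list(2);
list.Add(smallA);
list.Add(smallB);
Concatenate(&list, big, dim);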
......@@ -29,7 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes */
or "Merge" by means of the tensor shapes
*/
void Concatenate(XList * smalls, XTensor * big, int dim);
/* concatenate two tensors along a given dimension */
......
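The selection rule mentioned in the comment hinges on whether all input shapes match. A standalone sketch of that check, assuming a std::vector shape representation (illustrative, not the library's code):

#include <vector>

/* one vector<int> of dimension sizes per input tensor */
bool AllShapesEqual(const std::vector<std::vector<int>> & shapes)
{
    for (size_t i = 1; i < shapes.size(); i++)
        if (shapes[i] != shapes[0])
            return false;   /* mixed shapes: fall back to ConcatenateSolely */
    return true;            /* uniform shapes: Merge can be used */
}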
......@@ -64,9 +64,11 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
int offset = 0;
/* two strategies are used - we can either resort to memcpy2d for the case of
/*
two strategies are used - we can either resort to memcpy2d for the case of
concatenation of a few items, or use MergeBlockLists to merge a large number
of data blocks */
of data blocks
*/
if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
......
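For the few-items branch, the per-tensor 2D copy amounts to the following standalone reference loop (equal-sized sources assumed for brevity; illustrative only):

#include <cstring>
#include <vector>

/* concatenate several equal-sized row blocks into one wider target */
void CatRowsRef(const std::vector<const float*> & srcs,
                int rowLen, int rowNum, float * dst)
{
    const int dstRowLen = rowLen * (int)srcs.size();
    int offset = 0;
    for (const float * s : srcs) {
        for (int r = 0; r < rowNum; r++)
            std::memcpy(dst + r * dstRowLen + offset,
                        s + r * rowLen, rowLen * sizeof(float));
        offset += rowLen;
    }
}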
......@@ -26,7 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */
extern "C"
void ConcatenateSolely(XList * smalls, XTensor * big, int dim);
......
......@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
else {
int devID = myMem != NULL ? myMem->devID : -1;
/* The following code should be fine with GPUs, but too many
/*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion). */
one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0; i < blockNum; i++) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
......
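The kernel fusion the comment refers to replaces blockNum host-side copies with one launch over all blocks. A hypothetical batched-copy kernel sketching the idea (not the library's KernelCopyBlocks):

__global__ void BatchedBlockCopy(const char * src, char * dst,
                                 const int * srcOffsets, const int * dstOffsets,
                                 int blockSize, int blockNum)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   /* byte within a block */
    int b = blockIdx.y;                              /* which block */
    if (b < blockNum && i < blockSize)
        dst[dstOffsets[b] + i] = src[srcOffsets[b] + i];
}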
......@@ -25,6 +25,7 @@
#include "CopyBlocksOnSite.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy a number of blocks to target positions. Here we assume that
all the data is already on the device (CPU/GPU).
......@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
else {
int devID = myMem != NULL ? myMem->devID : -1;
/* The following code should be fine with GPUs, but too many
/*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion). */
one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize);
......
......@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
in the k-th grid
>> blockDim - leading dimension of blocks
>> blockNumInGrid - number of blocks in each grid
>> isOnDev - indicates whether the index is on the device already
>> isIndexOnDev - indicates whether the index is on the device already
*/
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
......
......@@ -36,6 +36,7 @@ copy indexed sub-tensors
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g.,
for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
<< return - whether the copy indexed operation was successful
*/
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
......
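The srcIndex/copyNum expansion in the comment (e.g., srcIndex = [1,4] with copyNum = 2 selects sub-tensors 1, 2, 4, 5) can be written as a small standalone helper; a sketch:

#include <vector>

/* expand each base index i into the run i, i+1, ..., i+copyNum-1 */
std::vector<int> ExpandIndex(const std::vector<int> & index, int copyNum)
{
    std::vector<int> expanded;
    for (int i : index)
        for (int k = 0; k < copyNum; k++)
            expanded.push_back(i + k);
    return expanded;
}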
......@@ -28,7 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/**************************************/
/* copy all elements from a source matrix to a target matrix */
extern "C"
bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
......
......@@ -52,7 +52,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
else
reqiredSize = m->unitSize * m->unitNum;
//reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + size, reqiredSize);
size += reqiredSize;
}
......@@ -70,7 +69,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
else
pSize = m->unitSize * m->unitNum;
//reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + p, pSize);
reqiredSize = pSize;
memcpy(data + p, m->data, pSize);
......
......@@ -24,6 +24,7 @@
#include "MakeSplitBlockIndex.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set target data block index for the data movement in split
>> blockIndex - block index
......
......@@ -51,6 +51,7 @@ void KernelMakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSiz
/*
set target data block index for the data movement in split
>> devID - device id
>> blockIndex - block index
>> splitNum - number of splits
>> blockSplitSize - size of the split block
......
......@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed
>> c - output matrix (2d tensor)
>> alpha - scalar
>> beta - scalar
>> c - output matrix (2d tensor)
*/
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB,
......@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
}
}
//if(isUniform){
//}
//else{
for (int i = 0; i < a->count; i++) {
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
......
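For one batch item with no transposition, the update c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta reduces to a plain GEMM loop; a row-major reference sketch (not the library's optimized path):

/* c = a * b * alpha + c * beta for an m x k times k x n product */
void GemmRef(const float * a, const float * b, float * c,
             int m, int k, int n, float alpha, float beta)
{
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            float sum = 0.0f;
            for (int p = 0; p < k; p++)
                sum += a[i * k + p] * b[p * n + j];
            c[i * n + j] = sum * alpha + c[i * n + j] * beta;
        }
    }
}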
......@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y.
*/
extern "C"
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -104,7 +104,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int num = *((int*)b->data);
char * p = (char*)b->data + sizeof(int); // pointer to the first tuple
/* a * b */
/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
for (int i = 0; i < num; i++) {
int key = *((int*)p);
......
......@@ -37,11 +37,13 @@ c = a * b * \alpha
>> aColSize - column size of matrix a
>> aRowSize - row size of matrix a
>> b - a sparse matrix
>> transposedA - indicates whether a is transposed
>> transposedB - indicates whether b is transposed
>> bNonZeroNum - number of non-zero items in b
>> bColSize - column size of matrix b
>> bRowSize - row size of matrix b
>> c - the resulting (dense) matrix
>> cColSize - column size of matrix c
>> cRowSize - row size of matrix c
>> alpha - the scaling factor
*/
extern "C" __global__
......@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) {
CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");
//cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
......@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (beta == 0)
c->SetZeroAll();
else if (beta != 1.0F) {
//XTensor::ScaleAndShift(c, beta, 0);
ShowNTErrors("TODO!");
}
......
......@@ -40,6 +40,7 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it along a dimension, e.g., (N/3, M, 3) -> (N, M)
>> s - the source tensor
......
......@@ -27,12 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
merge data by blocks
>> sourceList - list of source data array
>> blockSizes - list of the block size for each source data array
>> blockNum - number of blocks kept in each data array
>> target - target data array
>> myMem - memory pool
merge data by blocks
>> sourceList - list of source data array
>> blockSizes - list of the block size for each source data array
>> blockNum - number of blocks kept in each data array
>> target - target data array
>> myMem - memory pool
*/
void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
......
......@@ -34,10 +34,9 @@ copy a number of blocks (of different sizes) to target positions
>> sourceBlockSizes - the size of the block_i
>> sourceBlockNum - number of blocks to merge
>> targetList - list of data arrays to copy to
>> target - target data array
*/
__global__
void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
{
__shared__ int iBlockSizes[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * iSourceList[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -82,7 +81,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
int minBlockSize = MAX_INT;
int maxBlockSize = -MAX_INT;
//int realMinBlockSize = 1;
int realMaxBlockSize = 1;
DTYPE ** sourceArrays = new DTYPE*[newBlockListSize];
DTYPE ** targetArrays = new DTYPE*[newBlockListSize];
......@@ -110,7 +108,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
CheckNTErrors((minBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
CheckNTErrors((maxBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
//realMinBlockSize = minBlockSize/sizeof(DTYPE);
realMaxBlockSize = maxBlockSize / sizeof(DTYPE);
int cudaGridSizes[3];
......@@ -120,31 +117,16 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
cudaGridSizes, cudaBlockSizes);
myMem->SetPinBuf();
//MTYPE offset0 = myMem->bufUsed;
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
//MTYPE offset1 = myMem->bufUsed;
DTYPE ** sourceArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);
//MTYPE offset2 = myMem->bufUsed;
DTYPE ** targetArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);
//MTYPE bufSize = myMem->bufUsed - offset0;
//char * CPUBuf = new char[bufSize];
//memset(CPUBuf, 0 , bufSize);
//memcpy(CPUBuf, sizes, sizeof(int) * newBlockListSize);
//memcpy(CPUBuf + (offset1 - offset0), sourceArrays, sizeof(DTYPE*) * newBlockListSize);
//memcpy(CPUBuf + (offset2 - offset0), targetArrays, sizeof(DTYPE*) * newBlockListSize);
XMemCopy(sizesGPU, myMem->devID, sizes, -1, sizeof(int) * newBlockListSize);
XMemCopy(sourceArraysGPU, myMem->devID, sourceArrays, -1, sizeof(DTYPE*) * newBlockListSize);
XMemCopy(targetArraysGPU, myMem->devID, targetArrays, -1, sizeof(DTYPE*) * newBlockListSize);
/* it is VERY tricky here because we squeeze three data copies into one */
//XMemCopy(sizesGPU, myMem->devID, CPUBuf, -1, bufSize);
KernelCopyBlockLists << <dim3(cudaGridSizes[0], cudaGridSizes[1]), dim3(cudaBlockSizes[0], cudaBlockSizes[1]) >> >
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
......@@ -154,7 +136,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
delete[] targetArrays;
delete[] sizes;
delete[] offsets;
//delete[] CPUBuf;
}
#endif // USE_CUDA
......
......@@ -24,6 +24,7 @@
#include "MultiplyElementWise.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
......
......@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> stride - the number of items we go over when moving to the next element along the leading dimension in a block
>> ldSizeA - size of the leading dimension of a
>> ldSizeB - size of the leading dimension of b
......
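Ignoring the leading-dimension bookkeeping, the formula c(i) = a(i) * b(i) + \alpha * c(i) is just the following reference loop (illustrative):

/* element-wise product with accumulation into c */
void MulElementWiseRef(const float * a, const float * b, float * c,
                       int n, float alpha)
{
    for (int i = 0; i < n; i++)
        c[i] = a[i] * b[i] + alpha * c[i];
}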
......@@ -26,8 +26,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its negative value
>> a - the tensor we are processing
set every entry to its negative value
>> a - the tensor we are processing
*/
void Negate(XTensor * a)
{
......
......@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> size - size of the data array
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * d, int size)
......
......@@ -25,6 +25,7 @@
#include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
......
......@@ -25,6 +25,7 @@
#include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
normalize the data with normal distribution (kernel code). For an input x,
......
......@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* normalize the data with normal distribution (Kernel code). For an input x,
/*
normalize the data with normal distribution (Kernel code). For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
......@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum);
/* normalize the data with normal distribution. For an input x,
/*
normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
......
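A reference for the formula y = a * (x - mean) / sqrt(variance + \epsilon) + b over one stride, with per-element scale a and bias b (a sketch, assuming that layout):

#include <cmath>

void NormalizeRef(const float * x, float * y, float mean, float var,
                  const float * a, const float * b, float epsilon, int n)
{
    for (int i = 0; i < n; i++)
        y[i] = a[i] * (x[i] - mean) / std::sqrt(var + epsilon) + b[i];
}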
......@@ -25,10 +25,11 @@
#include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the power(a, p)
>> a - the tensor
>> power - as it is
>> p - the exponent
*/
void Power(XTensor * a, DTYPE p)
{
......
......@@ -87,9 +87,6 @@ __global__
void KernelPower(__half * d, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//int i = blockDim.x * blockIdx.x + threadIdx.x;
//if (i < size)
// d[i] = hpow(d[i], p);
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
......@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
}
else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!");
//unsigned short p2 = FloatToFloat16(p);
//__half * pp = (__half*)&p2;
//KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
}
}
else {
......
......@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
max_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the max over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
}
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
max_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the max over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
*/
__global__
void KernelReduceMax(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
{
int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
max_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the max over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
*/
__global__
void KernelReduceMax(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
{
int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
if (i >= stride * blockNum)
return;
if (i >= stride * blockNum)
return;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
__shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
#else
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
#endif
__syncthreads();
__syncthreads();
int k = i / stride;
int iOffset = i % stride;
int k = i / stride;
int iOffset = i % stride;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half value = (i < stride * blockNum && j < strideNum) ?
__half value = (i < stride * blockNum && j < strideNum) ?
input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN);
#else
DTYPE value = (i < stride * blockNum && j < strideNum) ?
__half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
DTYPE value = (i < stride * blockNum && j < strideNum) ?
__half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
#endif
/* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
/* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
__syncthreads();
__syncthreads();
/* do reduction in shared mem */
for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
iData[idx] = iData[idx + s];
}
/* do reduction in shared mem */
for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
iData[idx] = iData[idx + s];
}
__syncthreads();
}
__syncthreads();
}
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
#else
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
#endif
}
/*
reduce a tensor to another that keeps the max value along a dimension - fast version
>> input - the input array (representing a tensor)
......@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
max_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
>> output - the output tensor
>> dim - which dimension to reduce
......
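The same reduction the kernels above implement, written as a CPU reference over one block viewed as strideNum rows of length stride (illustrative only):

void ReduceMaxRef(const float * input, float * output,
                  int stride, int strideNum)
{
    for (int i = 0; i < stride; i++) {
        float m = input[i];
        for (int j = 1; j < strideNum; j++) {
            float v = input[j * stride + i];
            if (v > m)
                m = v;   /* keep the column-wise maximum */
        }
        output[i] = m;
    }
}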
......@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * \sum_i a_i
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
......@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
ScaleAndShift(output, (DTYPE)1/num, 0);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
reduce a tensor to another that keeps the sum along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
= sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
/*
reduce a tensor to another that keeps the sum along a dimension - slow version
This is for float16 reduction.
Given a block of data, we go over each dimension i in the stride and we have
sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
= sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
/*
reduce a tensor to another that keeps the sum along a dimension - fast version
This is for float16 reduction
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......
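A CPU reference of the shifted (and optionally exponentiated) sum defined above (a sketch, one block only):

#include <cmath>

void ReduceSumRef(const float * input, float * output,
                  int stride, int strideNum, float shift, bool isExp)
{
    for (int i = 0; i < stride; i++) {
        float sum = 0.0f;
        for (int j = 0; j < strideNum; j++) {
            float v = input[j * stride + i] - shift;
            sum += isExp ? std::exp(v) : v;
        }
        output[i] = sum;
    }
}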
......@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
......
......@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
variance of the items along a dimension of the tensor.
For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
......
......@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entries
p = p * scale + shift
>> a - the tensor
>> scale - the scaling factor
>> shift - the shift factor
......
......@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
/*
scale and shift all matrix entries
p = p * scale + shift
>> a - the tensor
>> scale - the scaling factor
>> shift - the shift factor
......
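The formula p = p * scale + shift as an in-place reference loop (illustrative):

void ScaleAndShiftRef(float * d, int n, float scale, float shift)
{
    for (int i = 0; i < n; i++)
        d[i] = d[i] * scale + shift;
}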
......@@ -31,7 +31,7 @@ c = select(a)
>> dim - the dimension along with which we do the job
>> low - lower bound
>> high - upper bound (exclusive).
Note that range [1,3] means that we select 1 and 2.
Note that range [1,3] means that we select 1 and 2.
>> c - result tensor
*/
void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
......@@ -75,5 +75,4 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
ShowNTErrors("TODO");
}
}
/* GPU code
The trick here is to initialize the data on a temporary tensor on the CPU.
The CPU data is then copied to the GPU.
TODO: generate data points directly on the GPU.
/*
GPU code
The trick here is to initialize the data on a temporary tensor on the CPU.
The CPU data is then copied to the GPU.
TODO: generate data points directly on the GPU.
*/
else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
......
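The CPU-side half of the trick described above amounts to filling a host buffer before the device copy; a standalone sketch (std::rand here is illustrative, not the library's generator):

#include <cstdlib>

/* fill hostBuf with uniform values in [low, high]; the buffer is then
   copied to the device (e.g., via cudaMemcpy in a GPU build) */
void RandInitRef(float * hostBuf, int n, float low, float high)
{
    for (int i = 0; i < n; i++)
        hostBuf[i] = low + (high - low) * (float)std::rand() / (float)RAND_MAX;
}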
......@@ -39,6 +39,7 @@ void Sort(XTensor * a, XTensor * index, int dim)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */
index->SetAscendingOrder(dim);
......
......@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* sort the tensor along a given dimension */
extern "C"
void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
#endif // USE_CUDA
......
......@@ -27,6 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
extern "C"
void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */
......
......@@ -25,6 +25,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of data arrays (CUDA Kernel)
c = a + b * \beta
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of data arrays (CUDA Kernel) */
/* summation of data arrays (CUDA Kernel) */
extern "C" __global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
......
......@@ -27,6 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of a vector (column vector) and a tensor
c = a + \sum_{col} b_col * \beta
......
......@@ -26,7 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */
extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
......@@ -24,6 +24,7 @@
#include "TopK.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the top-k items along a given dimension
>> a - input tensor
......
......@@ -95,9 +95,11 @@ public:
/* swap */
__device__ void Swap(int i, int j)
{
/*CudaHeapNode<T> tmp = items[i];
/*
CudaHeapNode<T> tmp = items[i];
items[i] = items[j];
items[j] = tmp;*/
items[j] = tmp;
*/
int tmpIndex = items[i].index;
T tmpValue = items[i].value;
items[i] = items[j];
......@@ -239,8 +241,10 @@ void KernelTopK(T * input, int stride, int strideNum, int blockNum, int k, T min
if (threadIdx.x == 0) {
CudaXHeap<MIN_HEAP, T> heapFinal(k, k, heapData + k * threadIdx.y * blockDim.x);
/* merge the result over the workers.
This can be improved by parallel merging */
/*
merge the result over the workers.
This can be improved by parallel merging
*/
if (blockDim.x > 1) {
for (int p = 1; p < blockDim.x && p < strideNum; p++) {
CudaHeapNode<T> * hd = heapData + k * (threadIdx.y * blockDim.x + p);
......@@ -429,6 +433,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
}
}
/* we resort to sorting if the data cannot fit inside the shared memory */
else {
int dimSize[MAX_TENSOR_DIM_NUM];
......
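The per-worker strategy in KernelTopK is a size-k min-heap over the stream of candidates; a host-side reference of the same idea (illustrative, using the standard library rather than CudaXHeap):

#include <functional>
#include <queue>
#include <vector>

std::vector<float> TopKRef(const std::vector<float> & data, int k)
{
    /* min-heap holding the k largest values seen so far */
    std::priority_queue<float, std::vector<float>, std::greater<float>> heap;
    for (float v : data) {
        if ((int)heap.size() < k)
            heap.push(v);
        else if (v > heap.top()) {
            heap.pop();
            heap.push(v);
        }
    }
    std::vector<float> result;   /* comes out in ascending order */
    while (!heap.empty()) {
        result.push_back(heap.top());
        heap.pop();
    }
    return result;
}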
......@@ -227,7 +227,7 @@ int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex)
x2 = colSize - 1;
y2 = rowSize - 1; // bottom-right corner
/* the main body of the matrix (after removing the margin block) */
/* the main body of the matrix (after removing the margin block) */
while (x1 <= xMax) {
y1 = 0;
x2 = x1 + colSize - 1;
......
......@@ -26,9 +26,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*******************************************************************
segmentation and parallel processing for 2d tensors (i.e., matrices)
*/
/* segmentation and parallel processing for 2d tensors (i.e., matrices) */
/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
extern "C"
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
......
......@@ -28,9 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
matrix multiplication via cuda version BLAS
*/
/* matrix multiplication via cuda version BLAS */
void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
......@@ -85,9 +83,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
}
}
/*
matrix multiplication via cuda version BLAS
*/
/* matrix multiplication via cuda version BLAS */
void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
......@@ -143,7 +139,6 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
}
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
......@@ -198,9 +193,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
}
}
/*
matrix multiplication via cuda version BLAS
*/
/* matrix multiplication via cuda version BLAS */
void CudaBLASMatrixMULList(cublasHandle_t * handle,
XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB,
......
......@@ -25,7 +25,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
hard tanh function
y = 1 if x > 1
......
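The full piecewise definition behind the comment above, as a reference sketch of the standard hard tanh:

float HardTanHRef(float x)
{
    if (x > 1.0f)
        return 1.0f;    /* y = 1  if x > 1  */
    if (x < -1.0f)
        return -1.0f;   /* y = -1 if x < -1 */
    return x;           /* y = x  otherwise */
}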
......@@ -95,7 +95,6 @@ dy/dx = 1 if -1 <= x <= 1
>> y - y of the function
>> x - x of the function
>> size - size of y/x
*/
__global__
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
......
......@@ -49,7 +49,6 @@ void LogSoftmax(XTensor * x, XTensor * y, int leadDim)
dimSize[i - 1] = -x->dimSize[i];
}
XMem * mem = x->mem;
XTensor * max = NULL;
XTensor * sum = NULL;
......@@ -168,7 +167,6 @@ dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
dy_i/dx_j
= d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
= d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j
......
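Carrying the derivative above through gives the standard result (\delta is the Kronecker delta; a reference note):

dy_i/dx_j = \delta(i,j) - e^{x_j} / \sum_{k} e^{x_k}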
......@@ -41,7 +41,8 @@ void CudaLogSoftmax(XTensor * x, XTensor * y, int leadDim)
ShowNTErrors("You should call LogSoftmax instead!");
}
/* log softmax forward computation (Cuda kernel)
/*
log softmax forward computation (Cuda kernel)
for each column j, let y_{i,j} and x_{i,j} are the output
and state value for the i-th element of column j. We have
......@@ -85,7 +86,8 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
}
}
/* log softmax forward computation (Cuda kernel)
/*
log softmax forward computation (Cuda kernel)
for each row i, let y_{i,j} and x_{i,j} are the output
and state value for the j-th element of row i. We have
......@@ -182,7 +184,7 @@ void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/*
set dE/dx = exp(y)
>> dedu - dE/dy
>> dedy - dE/dy
>> dedx - dE/dx
>> y - output of the function
>> size - size of output
......@@ -256,7 +258,9 @@ dE/dx_j += -gold_j
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> size - size of input/output
>> rowNum - row number of the matrix
>> colNum - column number of the matrix
>> gNonZeroNum - number of non-zero items in the gold standard
>> lossName - name of the loss function
*/
__global__
......@@ -293,7 +297,6 @@ dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
dy_i/dx_j
= d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
= d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j
......
......@@ -31,7 +31,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
loss function to measure the "number" of errors
*/
/*
compute the loss
>> gold - gold standard
......
......@@ -88,7 +88,6 @@ dy/dx = 1 if x >= 0
>> y - output of the function
>> x - input of the function
>> size - size of output/input
*/
__global__
void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
......
......@@ -25,7 +25,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
sigmoid function y = 1/(1+exp(-x))
>> x - input tensor
......
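The forward function and the derivative identity the backward code relies on, as reference sketches (dy/dx = y * (1 - y) is the standard sigmoid identity):

#include <cmath>

float SigmoidRef(float x)
{
    return 1.0f / (1.0f + std::exp(-x));   /* y = 1/(1+exp(-x)) */
}

float SigmoidGradRef(float y)
{
    return y * (1.0f - y);                 /* dy/dx written in terms of y */
}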
......@@ -95,7 +95,6 @@ sigmoid: y = 1/(1+exp(-x))
>> y - output of the function
>> x - input of the function
>> size - size of output/input
*/
__global__
void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
......@@ -122,7 +121,6 @@ sigmoid: y = 1/(1+exp(-x))
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void CudaSotmax(XTensor * input, XTensor * output, int leadDim);
......
......@@ -22,8 +22,10 @@
#include "TConcatenate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: concatenate a list of tensors along a given dimension.
* In this case, 2 * (2, 1) -> (2, 2), dim=1.
/*
case 1: concatenate a list of tensors along a given dimension.
In this case, 2 * (2, 1) -> (2, 2), dim=1.
*/
bool TestConcatenate1()
{
......@@ -60,12 +62,12 @@ bool TestConcatenate1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][1] = { {2.0},
{3.0} };
DTYPE answer[2][2] = { {0.0, 2.0},
{1.0, 3.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][1] = { {2.0F},
{3.0F} };
DTYPE answer[2][2] = { {0.0F, 2.0F},
{1.0F, 3.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -144,8 +146,9 @@ bool TestConcatenate1()
#endif // USE_CUDA
}
/* case 2: concatenate a list of tensors along a given dimension.
* In this case, 2 * (2, 1) -> (4, 1), dim=0.
/*
case 2: concatenate a list of tensors along a given dimension.
In this case, 2 * (2, 1) -> (4, 1), dim=0.
*/
bool TestConcatenate2()
{
......@@ -182,14 +185,14 @@ bool TestConcatenate2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][1] = { {2.0},
{3.0} };
DTYPE answer[4][1] = { {0.0},
{1.0},
{2.0},
{3.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][1] = { {2.0F},
{3.0F} };
DTYPE answer[4][1] = { {0.0F},
{1.0F},
{2.0F},
{3.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -268,8 +271,9 @@ bool TestConcatenate2()
#endif // USE_CUDA
}
/* case 3: concatenate a list of tensors along a given dimension.
* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
/*
case 3: concatenate a list of tensors along a given dimension.
In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
*/
bool TestConcatenate3()
{
......@@ -306,12 +310,12 @@ bool TestConcatenate3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][2] = { {2.0, 3.0},
{4.0, 5.0} };
DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
{1.0, 4.0, 5.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][2] = { {2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
{1.0F, 4.0F, 5.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -390,8 +394,9 @@ bool TestConcatenate3()
#endif // USE_CUDA
}
/* case 4: concatenate two tensors along a given dimension.
* In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
/*
case 4: concatenate two tensors along a given dimension.
In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
*/
bool TestConcatenate4()
{
......@@ -425,12 +430,12 @@ bool TestConcatenate4()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][2] = { {2.0, 3.0},
{4.0, 5.0} };
DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
{1.0, 4.0, 5.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][2] = { {2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
{1.0F, 4.0F, 5.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -502,7 +507,6 @@ TODO!!
*/
/* test for Concatenate Function */
extern "C"
bool TestConcatenate()
{
XPRINT(0, stdout, "[TEST CONCATENATE] concatenate a list of tensors or two tensors along a given dimension \n");
......
......@@ -19,12 +19,14 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
*/
#include "TConcatenateSolely.h"
#include "../XList.h"
#include "TConcatenateSolely.h"
namespace nts { // namespace nt(NiuTrans.Tensor)
/* case 1: concatenate a list of tensors along a given dimension
* In this case, 2 * (2, 1) -> (2, 2), dim=1.
/*
case 1: concatenate a list of tensors along a given dimension
In this case, 2 * (2, 1) -> (2, 2), dim=1.
*/
bool TestConcatenateSolely1()
{
......@@ -61,12 +63,12 @@ bool TestConcatenateSolely1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][1] = { {2.0},
{3.0} };
DTYPE answer[2][2] = { {0.0, 2.0},
{1.0, 3.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][1] = { {2.0F},
{3.0F} };
DTYPE answer[2][2] = { {0.0F, 2.0F},
{1.0F, 3.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -145,8 +147,9 @@ bool TestConcatenateSolely1()
#endif // USE_CUDA
}
/* case 2: concatenate a list of tensors along a given dimension
* In this case, 2 * (2, 1) -> (4, 1), dim=0.
/*
case 2: concatenate a list of tensors along a given dimension
In this case, 2 * (2, 1) -> (4, 1), dim=0.
*/
bool TestConcatenateSolely2()
{
......@@ -183,14 +186,14 @@ bool TestConcatenateSolely2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][1] = { {2.0},
{3.0} };
DTYPE answer[4][1] = { {0.0},
{1.0},
{2.0},
{3.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][1] = { {2.0F},
{3.0F} };
DTYPE answer[4][1] = { {0.0F},
{1.0F},
{2.0F},
{3.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -269,8 +272,9 @@ bool TestConcatenateSolely2()
#endif // USE_CUDA
}
/* case 3: concatenate a list of tensors along a given dimension
* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
/*
case 3: concatenate a list of tensors along a given dimension
In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
*/
bool TestConcatenateSolely3()
{
......@@ -307,12 +311,12 @@ bool TestConcatenateSolely3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][2] = { {2.0, 3.0},
{4.0, 5.0} };
DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
{1.0, 4.0, 5.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][2] = { {2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
{1.0F, 4.0F, 5.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -397,7 +401,6 @@ TODO!!
*/
/* test for ConcatenateSolely Function */
extern "C"
bool TestConcatenateSolely()
{
XPRINT(0, stdout, "[TEST CONCATENATESOLELY] concatenate a list of tensors along a given dimension \n");
......
......@@ -22,9 +22,11 @@
#include "TCopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 copy indexed sub-tensors
* In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2,
* srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
/*
case 1: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2,
srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
*/
bool TestCopyIndexed1()
{
......@@ -50,19 +52,19 @@ bool TestCopyIndexed1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0}},
{ {-1.0, 3.0, 2.0},
{1.0, -1.0, 0.0} } };
DTYPE answer[3][2][2] = { { {0.0, 2.0},
{2.0, 3.0} },
{ {1.0, 4.0},
{3.0, 2.0}},
{ {-1.0, 2.0},
{1.0, 0.0} } };
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
{2.0F, 3.0F} },
{ {1.0F, 4.0F},
{3.0F, 2.0F}},
{ {-1.0F, 2.0F},
{1.0F, 0.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
......@@ -131,7 +133,6 @@ TODO!!
*/
/* test for CopyIndexed Function */
extern "C"
bool TestCopyIndexed()
{
XPRINT(0, stdout, "[TEST CopyIndexed] copy indexed sub-tensors \n");
......
......@@ -23,6 +23,7 @@
#include "TCopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: copy tensor s to tensor t */
bool TestCopyValues1()
{
......@@ -36,11 +37,11 @@ bool TestCopyValues1()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE scaleFactor = 2.0;
DTYPE shiftFactor = 0.5;
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5F;
/* CPU test */
bool cpuTest = true;
......@@ -105,7 +106,6 @@ TODO!!
*/
/* test for CopyValues Function */
extern "C"
bool TestCopyValues()
{
XPRINT(0, stdout, "[TEST CopyValues] copy tensor s to tensor t \n");
......
......@@ -22,10 +22,11 @@
#include "THardTanH.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: hard tanh function */
bool TestHardTanH1()
{
/* a x tensor of size 2 * 3 */
/* a x tensor of size (2, 3) */
int xOrder = 2;
int * xDimSize = new int[xOrder];
xDimSize[0] = 2;
......@@ -35,7 +36,7 @@ bool TestHardTanH1()
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
/* a y tensor of size (2, 3) */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
......@@ -45,10 +46,10 @@ bool TestHardTanH1()
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
{3.5, -4.5, 1.0} };
DTYPE answer[2][3] = { {0.5, -1.0, 1.0},
{1.0, -1.0, 1.0} };
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -86,25 +87,32 @@ bool TestHardTanH1()
gpuTest = yGPU->CheckData(answer, yUnitNum, 1e-4F);
/* destroy variables */
delete x, y, xGPU, yGPU;
delete[] xDimSize, yDimSize;
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] xDimSize;
delete[] yDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x, y;
delete[] xDimSize, yDimSize;
delete x;
delete y;
delete[] xDimSize;
delete[] yDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* case 2: backward computation
* In this case, lossName=CROSSENTROPY.
/*
case 2: backward computation
In this case, lossName=CROSSENTROPY.
*/
bool TestHardTanH2()
{
/* a x tensor of size 2 * 3 */
/* a x tensor of size (2, 3) */
int xOrder = 2;
int * xDimSize = new int[xOrder];
xDimSize[0] = 2;
......@@ -114,7 +122,7 @@ bool TestHardTanH2()
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
/* a y tensor of size (2, 3) */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
......@@ -124,7 +132,7 @@ bool TestHardTanH2()
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */
/* a gold tensor of size (2, 3) */
int goldOrder = 2;
int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2;
......@@ -134,7 +142,7 @@ bool TestHardTanH2()
for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */
/* a dedy tensor of size (2, 3) */
int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2;
......@@ -144,7 +152,7 @@ bool TestHardTanH2()
for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */
/* a dedx tensor of size (2, 3) */
int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2;
......@@ -154,16 +162,16 @@ bool TestHardTanH2()
for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
{3.5, -4.5, 1.0} };
DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
{1.0, -1.0, 1.0} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
{1.0, 1.0, 1.0} };
DTYPE dedyData[2][3] = { {-2.0, 1.0, -1.0},
{-1.0, 1.0, -1.0} };
DTYPE answer[2][3] = { {-2.0, 1.0, 0.0},
{0.0, 0.0, -1.0} };
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-2.0F, 1.0F, -1.0F},
{-1.0F, 1.0F, -1.0F} };
DTYPE answer[2][3] = { {-2.0F, 1.0F, 0.0F},
{0.0F, 0.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -226,12 +234,13 @@ bool TestHardTanH2()
#endif // USE_CUDA
}
/* case 3: backward computation
* In this case, lossName=SQUAREDERROR.
/*
case 3: backward computation
In this case, lossName=SQUAREDERROR.
*/
bool TestHardTanH3()
{
/* a x tensor of size 2 * 3 */
/* a x tensor of size (2, 3) */
int xOrder = 2;
int * xDimSize = new int[xOrder];
xDimSize[0] = 2;
......@@ -241,7 +250,7 @@ bool TestHardTanH3()
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
/* a y tensor of size (2, 3) */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
......@@ -251,7 +260,7 @@ bool TestHardTanH3()
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */
/* a gold tensor of size (2, 3) */
int goldOrder = 2;
int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2;
......@@ -261,7 +270,7 @@ bool TestHardTanH3()
for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */
/* a dedy tensor of size (2, 3) */
int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2;
......@@ -271,7 +280,7 @@ bool TestHardTanH3()
for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */
/* a dedx tensor of size (2, 3) */
int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2;
......@@ -281,16 +290,16 @@ bool TestHardTanH3()
for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
{3.5, -4.5, 1.0} };
DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
{1.0, -1.0, 1.0} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
{1.0, 1.0, 1.0} };
DTYPE dedyData[2][3] = { {-0.5, -2.0, 0.0 },
{0.0, -2.0, 0.0 } };
DTYPE answer[2][3] = { {-0.5, -2.0, 0.0},
{0.0, 0.0, 0.0} };
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F },
{0.0F, -2.0F, 0.0F } };
DTYPE answer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, 0.0F, 0.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -353,12 +362,13 @@ bool TestHardTanH3()
#endif // USE_CUDA
}
/* case 4: backward computation
* In this case, lossName=ONEHOTERROR.
/*
case 4: backward computation
In this case, lossName=ONEHOTERROR.
*/
bool TestHardTanH4()
{
/* a x tensor of size 2 * 3 */
/* a x tensor of size (2, 3) */
int xOrder = 2;
int * xDimSize = new int[xOrder];
xDimSize[0] = 2;
......@@ -368,7 +378,7 @@ bool TestHardTanH4()
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */
/* a y tensor of size (2, 3) */
int yOrder = 2;
int * yDimSize = new int[yOrder];
yDimSize[0] = 2;
......@@ -378,7 +388,7 @@ bool TestHardTanH4()
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */
/* a gold tensor of size (2, 3) */
int goldOrder = 2;
int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2;
......@@ -388,7 +398,7 @@ bool TestHardTanH4()
for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */
/* a dedy tensor of size (2, 3) */
int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2;
......@@ -398,7 +408,7 @@ bool TestHardTanH4()
for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */
/* a dedx tensor of size (2, 3) */
int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2;
......@@ -408,16 +418,16 @@ bool TestHardTanH4()
for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
{3.5, -4.5, 1.0} };
DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
{1.0, -1.0, 1.0} };
DTYPE goldData[2][3] = { {1.0, 0.0, 1.0},
{0.0, 1.0, 1.0} };
DTYPE dedyData[2][3] = { {-0.5, 0.0, 0.0},
{0.0, -2.0, 0.0} };
DTYPE answer[2][3] = { {-0.5, 0.0, 0.0},
{0.0, 0.0, 0.0} };
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE goldData[2][3] = { {1.0F, 0.0F, 1.0F},
{0.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-0.5F, 0.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
DTYPE answer[2][3] = { {-0.5F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -486,10 +496,9 @@ TODO!!
*/
/* test for HardTanH Function */
extern "C"
bool TestHardTanH()
{
XPRINT(0, stdout, "[TEST HARDTANH] -------------\n");
XPRINT(0, stdout, "[TEST HARDTANH] test hardtanh and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
......
......@@ -23,8 +23,10 @@
#include "TIdentity.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test Identity function.
* Identity function: y = x
/*
case 1: test Identity function.
Identity function: y = x
*/
bool TestIdentity1()
{
......@@ -38,10 +40,10 @@ bool TestIdentity1()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[2][3] = { {0.0, 1.0, 2.0},
{0.5, 0.7, 1.4} };
DTYPE answer[2][3] = { {0.0, 1.0, 2.0},
{0.5, 0.7, 1.4} };
DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} };
/* CPU test */
bool cpuTest = true;
......@@ -93,8 +95,9 @@ bool TestIdentity1()
#endif // USE_CUDA
}
/* case 2: test IdentityBackward function.
* IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
/*
case 2: test IdentityBackward function.
IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
*/
bool TestIdentity2()
{
......@@ -107,9 +110,9 @@ bool TestIdentity2()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[1][3] = { {0.0, 1.0, 2.0} };
DTYPE gData[1][3] = { {0.0, 0.0, 1.0} };
DTYPE dedxAnswer[3] = {0.090031, 0.244728, -0.334759};
DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
/* CPU test */
bool cpuTest = true;
......@@ -135,7 +138,7 @@ bool TestIdentity2()
IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
/* check result */
cpuTest = dedx->CheckData(dedxAnswer, sUnitNum);
cpuTest = dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -162,7 +165,7 @@ bool TestIdentity2()
IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
/* check result */
gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum);
gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
/* destroy variables */
delete x;
......@@ -197,7 +200,6 @@ bool TestIdentity2()
*/
/* test for Identity Function */
extern "C"
bool TestIdentity()
{
XPRINT(0, stdout, "[TEST Identity] identity function and its backward computation \n");
......@@ -213,15 +215,15 @@ bool TestIdentity()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
///* case 2 test */
//caseFlag = TestIdentity2();
/* case 2 test */
caseFlag = TestIdentity2();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 2 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 2 passed!\n");
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
......
......@@ -23,8 +23,10 @@
#include "TLogSoftmax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test LogSoftmax function.
* LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
/*
case 1: test LogSoftmax function.
LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
*/
bool TestLogSoftmax1()
{
......@@ -38,10 +40,10 @@ bool TestLogSoftmax1()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[2][3] = { {0.0, 1.0, 2.0},
{0.5, 0.7, 1.4} };
DTYPE answer[2][3] = { {-2.4076, -1.4076, -0.4076},
{-1.5435, -1.3435, -0.6435} };
DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F},
{-1.5435F, -1.3435F, -0.6435F} };
/* CPU test */
bool cpuTest = true;
......@@ -58,7 +60,7 @@ bool TestLogSoftmax1()
LogSoftmax(x, y, 1);
/* check result */
cpuTest = y->CheckData(answer, sUnitNum);
cpuTest = y->CheckData(answer, sUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -76,7 +78,7 @@ bool TestLogSoftmax1()
LogSoftmax(xGPU, yGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, sUnitNum);
gpuTest = yGPU->CheckData(answer, sUnitNum, 1e-4F);
/* destroy variables */
delete x;
......@@ -97,9 +99,10 @@ bool TestLogSoftmax1()
#endif // USE_CUDA
}
/* case 2: test LogSoftmaxBackward function.
* dE/dx = dE/dy * dy/dx
* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
/*
case 2: test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
*/
bool TestLogSoftmax2()
{
......@@ -112,10 +115,10 @@ bool TestLogSoftmax2()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[3] = {0.0, 1.0, 2.0};
DTYPE gData[3] = {0.5, 0.8, 1.5};
DTYPE yAnswer[3] = {-2.4076, -1.4076, -0.4076};
DTYPE dedxAnswer[3] = {-0.409969, -0.555272, -0.834759};
DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.5F, 0.8F, 1.5F};
DTYPE yAnswer[3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[3] = {-0.409969F, -0.555272F, -0.834759F};
/* CPU test */
bool cpuTest = true;
......@@ -141,7 +144,7 @@ bool TestLogSoftmax2()
LogSoftmaxBackward(g, y, x, dedy, dedx, 0, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) && dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -168,7 +171,7 @@ bool TestLogSoftmax2()
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 0, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
/* destroy variables */
delete x;
......@@ -197,9 +200,10 @@ bool TestLogSoftmax2()
#endif // USE_CUDA
}
/* case 3: test LogSoftmaxBackward function.
* dE/dx = dE/dy * dy/dx
* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
/*
case 3: test LogSoftmaxBackward function.
dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
*/
bool TestLogSoftmax3()
{
......@@ -213,10 +217,10 @@ bool TestLogSoftmax3()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE xData[1][3] = { {0.0, 1.0, 2.0} };
DTYPE gData[1][3] = { {0.5, 0.8, 1.5} };
DTYPE yAnswer[1][3] = {-2.4076, -1.4076, -0.4076};
DTYPE dedxAnswer[1][3] = {-0.409969, -0.555272, -0.834759};
DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.5F, 0.8F, 1.5F} };
DTYPE yAnswer[1][3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[1][3] = {-0.409969F, -0.555272F, -0.834759F};
/* CPU test */
bool cpuTest = true;
......@@ -242,7 +246,7 @@ bool TestLogSoftmax3()
LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
/* check result */
cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) && dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -269,7 +273,7 @@ bool TestLogSoftmax3()
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
/* destroy variables */
delete x;
......@@ -305,7 +309,6 @@ bool TestLogSoftmax3()
*/
/* test for LogSoftmax Function */
extern "C"
bool TestLogSoftmax()
{
XPRINT(0, stdout, "[TEST LogSoftmax] test log softmax function and its backward computation \n");
......@@ -321,15 +324,15 @@ bool TestLogSoftmax()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
///* case 2 test */
//caseFlag = TestLogSoftmax2();
/* case 2 test */
caseFlag = TestLogSoftmax2();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 2 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 2 passed!\n");
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestLogSoftmax3();
......
......@@ -23,10 +23,12 @@
#include "../function/Loss.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test LossCompute function
* In this case, Loss function name = SQUAREDERROR.
* loss = sum_{i} 0.5*(t_i - y_i)^2,
* where t_i is the gold standard and y_i is the model output
/*
case 1: test LossCompute function
In this case, Loss function name = SQUAREDERROR.
loss = sum_{i} 0.5*(t_i - y_i)^2,
where t_i is the gold standard and y_i is the model output
*/
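For instance (illustrative numbers, not the test data below): with gold t = (1.0, 0.0) and output y = (0.5, 0.5), loss = 0.5*(1.0-0.5)^2 + 0.5*(0.0-0.5)^2 = 0.25.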
bool TestLoss1()
{
......@@ -99,10 +101,11 @@ bool TestLoss1()
#endif // USE_CUDA
}
/* case 2: test LossCompute function
* In this case, Loss function name = CROSSENTROPY.
* loss = sum_{i} (-t_i * log(y_i))
* where t_i is the gold standard and y_i is the model output
/*
case 2: test LossCompute function
In this case, Loss function name = CROSSENTROPY.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output
*/
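For instance (illustrative numbers, not the test data below): with t = (1.0, 0.0) and y = (0.5, 0.5), only the t_i = 1 term contributes, so loss = -log(0.5) ≈ 0.6931.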
bool TestLoss2()
{
......@@ -175,10 +178,11 @@ bool TestLoss2()
#endif // USE_CUDA
}
/* case 3: test LossCompute function
* In this case, Loss function name = ONEHOTERROR.
* loss = sum_{i} e_i
* where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
/*
case 3: test LossCompute function
In this case, Loss function name = ONEHOTERROR.
loss = sum_{i} e_i
where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
*/
bool TestLoss3()
{
......@@ -191,16 +195,16 @@ bool TestLoss3()
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE outputData[5][1] = { {0.5},
{0.5},
{0.5},
{0.5},
{0.5} };
DTYPE goldData[5][1] = { {1.0},
{1.0},
{0.0},
{0.0},
{0.0} };
DTYPE outputData[5][1] = { {0.5F},
{0.5F},
{0.5F},
{0.5F},
{0.5F} };
DTYPE goldData[5][1] = { {1.0F},
{1.0F},
{0.0F},
{0.0F},
{0.0F} };
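With this data, only the first two rows have t_i = 1, so the expected loss would be 2 * 0.5 * (1.0 - 0.5)^2 = 0.25.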
/* CPU test */
bool cpuTest = true;
......@@ -263,7 +267,6 @@ TODO!!
*/
/* test for Loss Function */
extern "C"
bool TestLoss()
{
XPRINT(0, stdout, "[TEST Loss] compute the loss \n");
......
......@@ -22,9 +22,10 @@
#include "TMatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication in batch mode (CPU code).
* In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 1: matrix multiplication in batch mode (CPU code).
In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMulBatchedCPU1()
{
......@@ -63,20 +64,20 @@ bool TestMatrixMulBatchedCPU1()
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE aData2[2][3] = { {1.0, -2.0, -3.0},
{-4.0, 3.0, 2.0} };
DTYPE bData1[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE bData2[3][2] = { {0.0, 1.0},
{3.0, 2.0},
{2.0, 1.0} };
DTYPE answer1[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE answer2[2][2] = { {-12.0, -6.0},
{13.0, 4.0} };
DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
{-4.0F, 3.0F, 2.0F} };
DTYPE bData1[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE bData2[3][2] = { {0.0F, 1.0F},
{3.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer1[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
DTYPE answer2[2][2] = { {-12.0F, -6.0F},
{13.0F, 4.0F} };
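These expected matrices follow from a plain triple loop; a minimal standalone check of answer1 (independent of the toolkit, assuming float data):

#include <cstdio>

int main()
{
    float a[2][3] = { {1.0F, 2.0F, 3.0F}, {-4.0F, 5.0F, 6.0F} };
    float b[3][2] = { {0.0F, -1.0F}, {1.0F, 2.0F}, {2.0F, 1.0F} };
    float c[2][2] = { {0.0F, 0.0F}, {0.0F, 0.0F} };
    /* c = a * b by the naive triple loop */
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            for (int k = 0; k < 3; k++)
                c[i][j] += a[i][k] * b[k][j];
    std::printf("%g %g %g %g\n", c[0][0], c[0][1], c[1][0], c[1][1]); /* 8 6 17 20 */
    return 0;
}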
/* CPU test */
bool cpuTest = true;
......
......@@ -22,9 +22,11 @@
#include "TMatrixMul.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication.
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 1: matrix multiplication.
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul1()
{
......@@ -58,13 +60,13 @@ bool TestMatrixMul1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -130,9 +132,10 @@ bool TestMatrixMul1()
#endif // USE_CUDA
}
/* case 2: matrix multiplication.
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
* transposedA=X_TRANS, transposedB=X_NOTRANS.
/*
case 2: matrix multiplication.
In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
transposedA=X_TRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul2()
{
......@@ -166,14 +169,14 @@ bool TestMatrixMul2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0},
{2.0, 5.0},
{3.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[3][2] = { {1.0F, -4.0F},
{2.0F, 5.0F},
{3.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
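Note that sData1 here is exactly the case-1 input stored transposed ((3, 2) instead of (2, 3)), so with transposedA=X_TRANS the product c = a^T * b and the expected answer are unchanged.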
/* CPU test */
bool cpuTest = true;
......@@ -239,9 +242,10 @@ bool TestMatrixMul2()
#endif // USE_CUDA
}
/* case 3: matrix multiplication.
* In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 3: matrix multiplication.
In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul3()
{
......@@ -279,30 +283,30 @@ bool TestMatrixMul3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0}},
{ {-1.0, 3.0, 2.0},
{1.0, -1.0, 0.0} } };
DTYPE sData2[2][3][2] = { { {1.0, 2.0},
{-4.0, 3.0},
{2.0, 6.0} },
{ {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} } };
DTYPE answer[3][2][2][2] = { { { {8.0, 9.0},
{4.0, 25.0} },
{ {7.0, 8.0},
{20.0, 26.0} } },
{ { {1.0, 32.0},
{3.0, 21.0} },
{ {27.0, 34.0},
{16.0, 22.0} } },
{ { {-9.0, 19.0},
{5.0, -1.0} },
{ {18.0, 22.0},
{-2.0, -2.0} } } };
DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
{-4.0F, 3.0F},
{2.0F, 6.0F} },
{ {1.0F, 2.0F},
{3.0F, 4.0F},
{5.0F, 6.0F} } };
DTYPE answer[3][2][2][2] = { { { {8.0F, 9.0F},
{4.0F, 25.0F} },
{ {7.0F, 8.0F},
{20.0F, 26.0F} } },
{ { {1.0F, 32.0F},
{3.0F, 21.0F} },
{ {27.0F, 34.0F},
{16.0F, 22.0F} } },
{ { {-9.0F, 19.0F},
{5.0F, -1.0F} },
{ {18.0F, 22.0F},
{-2.0F, -2.0F} } } };
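With unequal batch shapes, every one of the 3 slices of a is paired with every one of the 2 slices of b, giving 3 * 2 result matrices; e.g., answer[0][1] is sData1 slice 0 times sData2 slice 1.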
/* CPU test */
bool cpuTest = true;
......@@ -368,9 +372,10 @@ bool TestMatrixMul3()
#endif // USE_CUDA
}
/* case 4: matrix multiplication.
* In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 4: matrix multiplication.
In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul4()
{
......@@ -406,21 +411,21 @@ bool TestMatrixMul4()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0}},
{ {-1.0, 3.0, 2.0},
{1.0, -1.0, 0.0} } };
DTYPE sData2[3][2] = { {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} };
DTYPE answer[3][2][2] = { { {7.0, 8.0},
{20.0, 26.0} },
{ {27.0, 34.0},
{16.0, 22.0} },
{ {18.0, 22.0},
{-2.0, -2.0} } };
DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE sData2[3][2] = { {1.0F, 2.0F},
{3.0F, 4.0F},
{5.0F, 6.0F} };
DTYPE answer[3][2][2] = { { {7.0F, 8.0F},
{20.0F, 26.0F} },
{ {27.0F, 34.0F},
{16.0F, 22.0F} },
{ {18.0F, 22.0F},
{-2.0F, -2.0F} } };
/* CPU test */
bool cpuTest = true;
......@@ -493,7 +498,6 @@ bool TestMatrixMul4()
*/
/* test for MatrixMul Function */
extern "C"
bool TestMatrixMul()
{
XPRINT(0, stdout, "[TEST MATRIXMUL] matrix multiplication \n");
......
......@@ -22,9 +22,11 @@
#include "TMatrixMul2D.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication (for 2d tensors).
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 1: matrix multiplication (for 2d tensors).
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul2D1()
{
......@@ -58,13 +60,13 @@ bool TestMatrixMul2D1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -130,9 +132,10 @@ bool TestMatrixMul2D1()
#endif // USE_CUDA
}
/* case 2: matrix multiplication (for 2d tensors).
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
* transposedA=X_TRANS, transposedB=X_NOTRANS.
/*
case 2: matrix multiplication (for 2d tensors).
In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
transposedA=X_TRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul2D2()
{
......@@ -166,14 +169,14 @@ bool TestMatrixMul2D2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0},
{2.0, 5.0},
{3.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[3][2] = { {1.0F, -4.0F},
{2.0F, 5.0F},
{3.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
......
......@@ -22,9 +22,11 @@
#include "TMatrixMul2DParallel.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication (for 2d tensors) with multi-threading.
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 1: matrix multiplication (for 2d tensors) with multi-threading.
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul2DParallel1()
{
......@@ -58,13 +60,13 @@ bool TestMatrixMul2DParallel1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -96,9 +98,10 @@ bool TestMatrixMul2DParallel1()
return cpuTest;
}
/* case 2: matrix multiplication (for 2d tensors) with multi-threading.
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
* transposedA=X_TRANS, transposedB=X_NOTRANS.
/*
case 2: matrix multiplication (for 2d tensors) with multi-threading.
In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
transposedA=X_TRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMul2DParallel2()
{
......@@ -132,14 +135,14 @@ bool TestMatrixMul2DParallel2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0},
{2.0, 5.0},
{3.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[3][2] = { {1.0F, -4.0F},
{2.0F, 5.0F},
{3.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -177,7 +180,6 @@ bool TestMatrixMul2DParallel2()
*/
/* test for MatrixMul2DParallel Function */
extern "C"
bool TestMatrixMul2DParallel()
{
XPRINT(0, stdout, "[TEST MatrixMul2DParallel] matrix multiplication (for 2d tensors) with multi-threading \n");
......
......@@ -22,9 +22,10 @@
#include "TMatrixMULBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication of the two tensors.
* In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS,
transposedB=X_NOTRANS.
/*
case 1: matrix multiplication of the two tensors.
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMulBatched1()
{
......@@ -58,13 +59,13 @@ bool TestMatrixMulBatched1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
{-4.0, 5.0, 6.0} };
DTYPE sData2[3][2] = { {0.0, -1.0},
{1.0, 2.0},
{2.0, 1.0} };
DTYPE answer[2][2] = { {8.0, 6.0},
{17.0, 20.0} };
DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0F, 2.0F},
{2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0F, 20.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -130,9 +131,9 @@ bool TestMatrixMulBatched1()
#endif // USE_CUDA
}
/* case 2: matrix multiplication of the two tensors.
* In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2),
* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
/*
case 2: matrix multiplication of the two tensors.
In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/
bool TestMatrixMulBatched2()
{
......@@ -169,20 +170,20 @@ bool TestMatrixMulBatched2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2][3] = { { {0.0, -1.0, 2.0},
{2.0, 1.0, 3.0} },
{ {1.0, 2.0, 4.0},
{3.0, 1.0, 2.0} } };
DTYPE sData2[2][3][2] = { { {1.0, 2.0},
{-4.0, 3.0},
{2.0, 6.0} },
{ {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} } };
DTYPE answer[2][2][2] = { { {8.0, 9.0},
{4.0, 25.0} },
{ {27.0, 34.0},
{16.0, 22.0} } };
DTYPE sData1[2][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F} } };
DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
{-4.0F, 3.0F},
{2.0F, 6.0F} },
{ {1.0F, 2.0F},
{3.0F, 4.0F},
{5.0F, 6.0F} } };
DTYPE answer[2][2][2] = { { {8.0F, 9.0F},
{4.0F, 25.0F} },
{ {27.0F, 34.0F},
{16.0F, 22.0F} } };
/* CPU test */
bool cpuTest = true;
......@@ -254,7 +255,6 @@ bool TestMatrixMulBatched2()
*/
/* test for TestMatrixMulBatched Function */
extern "C"
bool TestMatrixMulBatched()
{
XPRINT(0, stdout, "[TEST MATRIXMULBATCHED] matrix multiplication of the two tensors \n");
......
......@@ -24,8 +24,10 @@
#include "TMerge.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: transform a tensor by merging it along with a dimension.
* In this case, (3, 2) -> (6), whereToMerge=1, leadingDim=0.
/*
case 1: transform a tensor by merging it along with a dimension.
In this case, (2, 3) -> (6), whereToMerge=1, leadingDim=0.
*/
bool TestMerge1()
{
......@@ -48,9 +50,9 @@ bool TestMerge1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[2][3] = { {0.0, 1.0, 2.0},
{3.0, 4.0, 5.0} };
DTYPE answer[6] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
{3.0F, 4.0F, 5.0F} };
DTYPE answer[6] = {0.0F, 1.0F, 2.0F, 3.0F, 4.0F, 5.0F};
/* CPU test */
bool cpuTest = true;
......@@ -107,8 +109,9 @@ bool TestMerge1()
#endif // USE_CUDA
}
/* case 2: transform a tensor by merging it along with a dimension.
* In this case,
/*
case 2: transform a tensor by merging it along with a dimension.
In this case,
(2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0.
(2, 2, 3) -> (2, 6), whereToMerge=2, leadingDim=0.
*/
......@@ -145,16 +148,16 @@ bool TestMerge2()
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][2][3] = { { {0.0, 1.0, 2.0},
{4.0, 5.0, 6.0} },
{ {-1.0, 2.0, 3.0},
{-4.0, -5.0, -6.0} } };
DTYPE answer1[4][3] = { {0.0, 1.0, 2.0},
{4.0, 5.0, 6.0},
{-1.0, 2.0, 3.0},
{-4.0, -5.0, -6.0} };
DTYPE answer2[2][6] = { {0.0, 1.0, 2.0, -1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, -4.0, -5.0, -6.0} };
DTYPE sData[2][2][3] = { { {0.0F, 1.0F, 2.0F},
{4.0F, 5.0F, 6.0F} },
{ {-1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F} } };
DTYPE answer1[4][3] = { {0.0F, 1.0F, 2.0F},
{4.0F, 5.0F, 6.0F},
{-1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F} };
DTYPE answer2[2][6] = { {0.0F, 1.0F, 2.0F, -1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, -4.0F, -5.0F, -6.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -222,7 +225,8 @@ bool TestMerge2()
#endif // USE_CUDA
}
/* case 3: merge small tensors into a big tensor.
/*
case 3: merge small tensors into a big tensor.
In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0.
*/
bool TestMerge3()
......@@ -240,10 +244,10 @@ bool TestMerge3()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0},
{-4.0, -5.0, -6.0, -7.0} };
DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* a target tensor of size (4, 4) */
int tOrder = 2;
......@@ -255,10 +259,10 @@ bool TestMerge3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE answer[4][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0},
{0.0, -1.0, -2.0, -3.0},
{-4.0, -5.0, -6.0, -7.0} };
DTYPE answer[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F},
{0.0F, -1.0F, -2.0F, -3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -336,7 +340,8 @@ bool TestMerge3()
#endif // USE_CUDA
}
/* case 4: merge small tensors into a big tensor.
/*
case 4: merge small tensors into a big tensor.
In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1.
*/
bool TestMerge4()
......@@ -354,10 +359,10 @@ bool TestMerge4()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0},
{-4.0, -5.0, -6.0, -7.0} };
DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* a target tensor of size (4, 4) */
int tOrder = 2;
......@@ -369,8 +374,8 @@ bool TestMerge4()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE answer[2][8] = { {0.0, 1.0, 2.0, 3.0, 0.0, -1.0, -2.0, -3.0},
{4.0, 5.0, 6.0, 7.0, -4.0, -5.0, -6.0, -7.0} };
DTYPE answer[2][8] = { {0.0F, 1.0F, 2.0F, 3.0F, 0.0F, -1.0F, -2.0F, -3.0F},
{4.0F, 5.0F, 6.0F, 7.0F, -4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -454,7 +459,6 @@ bool TestMerge4()
*/
/* test for Merge Function */
extern "C"
bool TestMerge()
{
XPRINT(0, stdout, "[TEST MERGE] transform a tensor by merging it alone with a dimension or merge small tensors into a big tensor\n");
......
......@@ -22,9 +22,11 @@
#include "TMultiplyElementWise.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: element-wise product of two tensors
* c(i) = a(i)*b(i) + \alpha * c(i)
* In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0.
/*
case 1: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0.
*/
bool TestMultiplyElementWise1()
{
......@@ -58,12 +60,12 @@ bool TestMultiplyElementWise1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0},
{1.0} };
DTYPE sData2[2][1] = { {2.0},
{3.0} };
DTYPE answer[2][1] = { {0.0},
{3.0} };
DTYPE sData1[2][1] = { {0.0F},
{1.0F} };
DTYPE sData2[2][1] = { {2.0F},
{3.0F} };
DTYPE answer[2][1] = { {0.0F},
{3.0F} };
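With alpha = 0 the result is the plain element-wise (Hadamard) product: (0, 1) * (2, 3) = (0, 3), as in the answer.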
/* CPU test */
bool cpuTest = true;
......@@ -129,9 +131,10 @@ bool TestMultiplyElementWise1()
#endif // USE_CUDA
}
/* case 2: element-wise product of two tensors
* c(i) = a(i)*b(i) + \alpha * c(i)
* In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
/*
case 2: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/
bool TestMultiplyElementWise2()
{
......@@ -165,12 +168,12 @@ bool TestMultiplyElementWise2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0, 1.0},
{2.0, 3.0} };
DTYPE sData2[2][2] = { {0.0, 1.0},
{2.0, 3.0} };
DTYPE answer[2][2] = { {0.0, 1.0},
{4.0, 9.0} };
DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE sData2[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE answer[2][2] = { {0.0F, 1.0F},
{4.0F, 9.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -236,8 +239,9 @@ bool TestMultiplyElementWise2()
#endif // USE_CUDA
}
/* case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
* In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0.
/*
case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0.
*/
bool TestMultiplyElementWise3()
{
......@@ -271,12 +275,12 @@ bool TestMultiplyElementWise3()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0, 1.0},
{2.0, 3.0} };
DTYPE sData2[2][2] = { {0.0, 1.0},
{2.0, 3.0} };
DTYPE answer[2][2] = { {0.0, 1.0},
{4.0, 9.0} };
DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE sData2[2][2] = { {0.0F, 1.0F},
{2.0F, 3.0F} };
DTYPE answer[2][2] = { {0.0F, 1.0F},
{4.0F, 9.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -348,7 +352,6 @@ TODO!!
*/
/* test for MultiplyElementWise Function */
extern "C"
bool TestMultiplyElementWise()
{
XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");
......
......@@ -22,6 +22,7 @@
#include "TNegate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set every entry to its minus value */
bool TestNegate1()
{
......@@ -35,12 +36,12 @@ bool TestNegate1()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0, -2.0},
{-3.0, 4.0},
{5.0, -6.0} };
DTYPE answer[3][2] = { {-1.0, 2.0},
{3.0, -4.0},
{-5.0, 6.0} };
DTYPE aData[3][2] = { {1.0F, -2.0F},
{-3.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {-1.0F, 2.0F},
{3.0F, -4.0F},
{-5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -101,12 +102,12 @@ bool TestNegate2()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0, 0.0},
{0.0, 0.0},
{0.0, 0.0} };
DTYPE answer[3][2] = { {-0.0, -0.0},
{-0.0, -0.0},
{-0.0, -0.0} };
DTYPE aData[3][2] = { {0.0F, 0.0F},
{0.0F, 0.0F},
{0.0F, 0.0F} };
DTYPE answer[3][2] = { {-0.0F, -0.0F},
{-0.0F, -0.0F},
{-0.0F, -0.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -160,7 +161,6 @@ TODO!!
*/
/* test for Negate Function */
extern "C"
bool TestNegate()
{
XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n");
......
......@@ -22,10 +22,12 @@
#include "TNormalize.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: normalized the data with normal distribution
* For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b.
* where a and b are the scalar and bias respectively,
* and \epsilon is the adjustment parameter.
/*
case 1: normalize the data with normal distribution
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b.
where a and b are the scalar and bias respectively,
and \epsilon is the adjustment parameter.
*/
bool TestNormalize1()
{
......@@ -87,14 +89,14 @@ bool TestNormalize1()
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE sData[2][3] = { {1.0, 2.0, 3.0},
{1.5, 2.5, 3.5} };
DTYPE meanData[3] = {1.0, 1.5, 2.0};
DTYPE varData[3] = {1.0, 1.0, 4.0};
DTYPE aData[2][3] = { {1.0, 1.0, 1.0},
{1.0, 1.0, 1.0} };
DTYPE answer[2][3] = { {0.0, 0.5, 0.5},
{0.5, 1.0, 0.75} };
DTYPE sData[2][3] = { {1.0F, 2.0F, 3.0F},
{1.5F, 2.5F, 3.5F} };
DTYPE meanData[3] = {1.0F, 1.5F, 2.0F};
DTYPE varData[3] = {1.0F, 1.0F, 4.0F};
DTYPE aData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE answer[2][3] = { {0.0F, 0.5F, 0.5F},
{0.5F, 1.0F, 0.75F} };
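A worked entry (the bias b is not shown in this hunk; the answers are consistent with b = 0 and \epsilon = 0): answer[0][1] = (2.0 - 1.5) / sqrt(1.0) = 0.5, and answer[1][2] = (3.5 - 2.0) / sqrt(4.0) = 0.75.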
/* CPU test */
bool cpuTest = true;
......@@ -116,7 +118,7 @@ bool TestNormalize1()
t->SetZeroAll();
/* call normalize function */
Normalize(s, t, 0, mean, var, a, b, 0.0);
Normalize(s, t, 0, mean, var, a, b, 0.0F);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum, 1e-4, 0);
......@@ -142,7 +144,7 @@ bool TestNormalize1()
tGPU->SetZeroAll();
/* call Normalize function */
Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0);
Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4, 0);
......@@ -193,7 +195,6 @@ TODO!!
*/
/* test for Normalize Function */
extern "C"
bool TestNormalize()
{
XPRINT(0, stdout, "[TEST NORMALIZE] normalized the data with normal distribution \n");
......
......@@ -23,8 +23,10 @@
#include "TPower.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the power(a, p)
* In this case, p=2.
/*
case 1: get the power(a, p)
In this case, p=2.
*/
bool TestPower1()
{
......@@ -38,12 +40,12 @@ bool TestPower1()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0, 2.0},
{3.0, 4.0},
{5.0, 6.0} };
DTYPE answer[3][2] = { {1.0, 4.0},
{9.0, 16.0},
{25.0, 36.0} };
DTYPE aData[3][2] = { {1.0F, 2.0F},
{3.0F, 4.0F},
{5.0F, 6.0F} };
DTYPE answer[3][2] = { {1.0F, 4.0F},
{9.0F, 16.0F},
{25.0F, 36.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -55,7 +57,7 @@ bool TestPower1()
a->SetData(aData, aUnitNum);
/* call Power function */
Power(a, 2.0);
Power(a, 2.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
......@@ -71,7 +73,7 @@ bool TestPower1()
aGPU->SetData(aData, aUnitNum);
/* call power function */
Power(aGPU, 2.0);
Power(aGPU, 2.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
......@@ -91,8 +93,9 @@ bool TestPower1()
#endif // USE_CUDA
}
/* case 2: get the power(a, p)
* In this case, p=1.
/*
case 2: get the power(a, p)
In this case, p=1.
*/
bool TestPower2()
{
......@@ -106,12 +109,12 @@ bool TestPower2()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0, 1.0},
{2.0, 3.0},
{4.0, 5.0} };
DTYPE answer[3][2] = { {0.0, 1.0},
{2.0, 3.0},
{4.0, 5.0} };
DTYPE aData[3][2] = { {0.0F, 1.0F},
{2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[3][2] = { {0.0F, 1.0F},
{2.0F, 3.0F},
{4.0F, 5.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -123,7 +126,7 @@ bool TestPower2()
a->SetData(aData, aUnitNum);
/* call Power function */
Power(a, 1.0);
Power(a, 1.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
......@@ -139,7 +142,7 @@ bool TestPower2()
aGPU->SetData(aData, aUnitNum);
/* call Power function */
Power(aGPU, 1.0);
Power(aGPU, 1.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
......@@ -159,8 +162,9 @@ bool TestPower2()
#endif // USE_CUDA
}
/* case 3: get the power(a, p)
* In this case, p=0.
/*
case 3: get the power(a, p)
In this case, p=0.
*/
bool TestPower3()
{
......@@ -174,12 +178,12 @@ bool TestPower3()
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0, 1.0},
{2.0, 3.0},
{4.0, 5.0} };
DTYPE answer[3][2] = { {1.0, 1.0},
{1.0, 1.0},
{1.0, 1.0} };
DTYPE aData[3][2] = { {0.0F, 1.0F},
{2.0F, 3.0F},
{4.0F, 5.0F} };
DTYPE answer[3][2] = { {1.0F, 1.0F},
{1.0F, 1.0F},
{1.0F, 1.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -191,7 +195,7 @@ bool TestPower3()
a->SetData(aData, aUnitNum);
/* call Power function */
Power(a, 0.0);
Power(a, 0.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
......@@ -207,7 +211,7 @@ bool TestPower3()
aGPU->SetData(aData, aUnitNum);
/* call Power function */
Power(aGPU, 0.0);
Power(aGPU, 0.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
......@@ -233,7 +237,6 @@ TODO!!
*/
/* test for Power Function */
extern "C"
bool TestPower()
{
XPRINT(0, stdout, "[TEST POWER] get the power(a, p) \n");
......
......@@ -22,8 +22,10 @@
#include "TRectify.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test rectify function
* y = max(0, x)
/*
case 1: test rectify function
In this case, y = max(0, x)
*/
bool TestRectify1()
{
......@@ -47,10 +49,10 @@ bool TestRectify1()
for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i];
DTYPE xData[2][3] = { {0.0, -1.0, 2.0},
{3.0, -4.0, -5.0} };
DTYPE answer[2][3] = { {0.0, 0.0, 2.0},
{3.0, 0.0, 0.0} };
DTYPE xData[2][3] = { {0.0F, -1.0F, 2.0F},
{3.0F, -4.0F, -5.0F} };
DTYPE answer[2][3] = { {0.0F, 0.0F, 2.0F},
{3.0F, 0.0F, 0.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -107,10 +109,11 @@ bool TestRectify1()
#endif // USE_CUDA
}
/* case 2: backward computation
* dE/dx = dE/dy * dy/dx
* rectified: y = max(0, x)
* In this case, lossName=CROSSENTROPY.
/*
case 2: backward computation
dE/dx = dE/dy * dy/dx
rectified: y = max(0, x)
In this case, lossName=CROSSENTROPY.
*/
bool TestRectify2()
{
......@@ -124,16 +127,16 @@ bool TestRectify2()
for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i];
DTYPE xData[2][3] = { {1.0, 1.0, 2.0},
{2.0, 4.0, 5.0} };
DTYPE yData[2][3] = { {1.0, 1.0, 2.0},
{2.0, 4.0, 5.0} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
{1.0, 1.0, 1.0} };
DTYPE dedyData[2][3] = { {-1.0, -1.0, -0.5},
{-0.5, -0.25, -0.2} };
DTYPE answer[2][3] = { {-1.0, -1.0, -0.5},
{-0.5, -0.25, -0.2} };
DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE yData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5F, -0.25F, -0.2F} };
DTYPE answer[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5F, -0.25F, -0.2F} };
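Since every entry of xData is positive here, dy/dx = 1 everywhere and dE/dx simply equals dE/dy, which is why answer reproduces dedyData.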
/* CPU test */
bool cpuTest = true;
......@@ -215,7 +218,6 @@ TODO!!
*/
/* test for Rectify Function */
extern "C"
bool TestRectify()
{
XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n");
......
......@@ -22,8 +22,10 @@
#include "TReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the max value of the items along a dimension of the tensor.
* In this case,
/*
case 1: get the max value of the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
......@@ -57,10 +59,10 @@ bool TestReduceMax1()
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0, 5.0, 2.0, 3.0},
{4.0, 1.0, 6.0, 7.0} };
DTYPE answer1[4] = {4.0, 5.0, 6.0, 7.0};
DTYPE answer2[2] = {5.0, 7.0};
DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
{4.0F, 1.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
DTYPE answer2[2] = {5.0F, 7.0F};
/* CPU test */
bool cpuTest = true;
......@@ -134,7 +136,6 @@ TODO!!
*/
/* test for ReduceMax Function */
extern "C"
bool TestReduceMax()
{
XPRINT(0, stdout, "[TEST ReduceMax] get the max value of the items along a dimension of the tensor\n");
......
......@@ -22,6 +22,7 @@
#include "TReduceMean.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the mean value along a dimension of the tensor */
bool TestReduceMean1()
{
......@@ -53,10 +54,10 @@ bool TestReduceMean1()
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { { 0.0, 1.0, 2.0, 3.0 },
{ 4.0, 5.0, 6.0, 7.0 } };
DTYPE answer1[4] = {2.0, 3.0, 4.0, 5.0};
DTYPE answer2[2] = {1.5, 5.5};
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {2.0F, 3.0F, 4.0F, 5.0F};
DTYPE answer2[2] = {1.5F, 5.5F};
/* CPU test */
bool cpuTest = true;
......@@ -124,104 +125,12 @@ bool TestReduceMean1()
#endif // USE_CUDA
}
bool TestReduceMeanForLargescale()
{
/* a tensor of size 10000 * 500 */
int order = 2;
int order_reduce = 1;
int * dimSize = new int[order];
dimSize[0] = 10000;
dimSize[1] = 500;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* a tensor of size 500 */
int * dimSize_reduce_a = new int[order_reduce];
dimSize_reduce_a[0] = 500;
int unitNum_a = 1;
for (int i = 0; i < order_reduce; i++)
unitNum_a *= dimSize_reduce_a[i];
/* a tensor of size 10000 */
int * dimSize_reduce_b = new int[order_reduce];
dimSize_reduce_b[0] = 10000;
int unitNum_b = 1;
for (int i = 0; i < order_reduce; i++)
unitNum_b *= dimSize_reduce_b[i];
DTYPE * data = new DTYPE[5000000];
DTYPE * tmp = data;
for (int i = 0; i < unitNum; i++)
*tmp++ = 1;
DTYPE answer_a[500];
for (int i = 0; i < unitNum_a; i++)
answer_a[i] = 1;
DTYPE answer_b[10000];
for (int i = 0; i < unitNum_b; i++)
answer_b[i] = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(order, dimSize);
XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
/* initialize variables */
a->SetData(data, unitNum);
b->SetData(data, unitNum);
/* call reduce max function */
ReduceMean(a, reduce_a, 0);
ReduceMean(b, reduce_b, 1);
/* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
/* Initialize variables */
aGPU->SetData(data, unitNum);
bGPU->SetData(data, unitNum);
/* call reduce max function */
ReduceMean(aGPU, reduce_aGPU, 0);
ReduceMean(bGPU, reduce_bGPU, 1);
/* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
/* destroy variables */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for ReduceMean Function */
extern "C"
bool TestReduceMean()
{
XPRINT(0, stdout, "[TEST ReduceMean] get the mean value along a dimension of the tensor \n");
......@@ -236,15 +145,6 @@ bool TestReduceMean()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMeanForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */
///*
//TODO!!
......
......@@ -24,13 +24,13 @@
#include "../core/ReduceMean.h"
namespace nts { // namespace nt(NiuTrans.Tensor)
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ReduceMean Function */
extern "C"
bool TestReduceMean();
} // namespace nt(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_REDUCEMEAN_H__
......@@ -22,8 +22,10 @@
#include "TReduceSum.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: sum the items along a dimension of the tensor.
* In this case,
/*
case 1: sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
*/
......@@ -57,10 +59,10 @@ bool TestReduceSum1()
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE answer1[4] = {4.0, 6.0, 8.0, 10.0};
DTYPE answer2[2] = {6.0, 22.0};
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
DTYPE answer2[2] = {6.0F, 22.0F};
/* CPU test */
bool cpuTest = true;
......@@ -128,103 +130,12 @@ bool TestReduceSum1()
#endif // USE_CUDA
}
bool TestReduceSumForLargescale()
{
/* a tensor of size 10000 * 500 */
int order = 2;
int orderReduce = 1;
int * dimSize = new int[order];
dimSize[0] = 10000;
dimSize[1] = 500;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* a tensor of size 500 */
int * dimSize_reduce_a = new int[orderReduce];
dimSize_reduce_a[0] = 500;
int unitNum_a = 1;
for (int i = 0; i < orderReduce; i++)
unitNum_a *= dimSize_reduce_a[i];
/* a tensor of size 10000 */
int * dimSize_reduce_b = new int[orderReduce];
dimSize_reduce_b[0] = 10000;
int unitNum_b = 1;
for (int i = 0; i < orderReduce; i++)
unitNum_b *= dimSize_reduce_b[i];
DTYPE * data = new DTYPE[5000000];
DTYPE * tmp = data;
for (int i = 0; i < unitNum; i++)
*tmp++ = 1;
DTYPE answer_a[500];
for (int i = 0; i < unitNum_a; i++)
answer_a[i] = 10000;
DTYPE answer_b[10000];
for (int i = 0; i < unitNum_b; i++)
answer_b[i] = 500;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(order, dimSize);
XTensor * reduce_a = NewTensor(orderReduce, dimSize_reduce_a);
XTensor * b = NewTensor(order, dimSize);
XTensor * reduce_b = NewTensor(orderReduce, dimSize_reduce_b);
/* initialize variables */
a->SetData(data, unitNum);
b->SetData(data, unitNum);
/* call reduce sum function */
ReduceSum(a, reduce_a, 0);
ReduceSum(b, reduce_b, 1);
/* check results */
cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_aGPU = NewTensor(orderReduce, dimSize_reduce_a, X_FLOAT);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
XTensor * reduce_bGPU = NewTensor(orderReduce, dimSize_reduce_b, X_FLOAT);
/* Initialize variables */
aGPU->SetData(data, unitNum);
bGPU->SetData(data, unitNum);
/* call reduce max function */
ReduceSum(aGPU, reduce_aGPU, 0);
ReduceSum(bGPU, reduce_bGPU, 1);
/* check results */
gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
/* destroy variables */
delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for ReduceSum Function */
extern "C"
bool TestReduceSum()
{
XPRINT(0, stdout, "[TEST ReduceSum] sum the items along a dimension of the tensor.\n");
......@@ -239,15 +150,6 @@ bool TestReduceSum()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSumForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -22,9 +22,11 @@
#include "TReduceSumSquared.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: squared sum of the items along a dimension of the tensor.
* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
* In this case, (2, 4) -> (4), dim = 0.
/*
case 1: squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (2, 4) -> (4), dim = 0.
*/
bool TestReduceSumSquared1()
{
......@@ -56,10 +58,10 @@ bool TestReduceSumSquared1()
for (int i = 0; i < shiftOrder; i++)
shiftUnitNum *= shiftDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE shiftData[4] = {1.0, -1.0, -1.0, 0.0};
DTYPE answer[4] = {10.0, 40.0, 58.0, 58.0};
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE shiftData[4] = {1.0F, -1.0F, -1.0F, 0.0F};
DTYPE answer[4] = {10.0F, 40.0F, 58.0F, 58.0F};
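E.g., the first column gives (0.0 - 1.0)^2 + (4.0 - 1.0)^2 = 1 + 9 = 10, the first entry of answer.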
/* CPU test */
bool cpuTest = true;
......@@ -125,9 +127,10 @@ bool TestReduceSumSquared1()
#endif // USE_CUDA
}
/* case 1: squared sum of the items along a dimension of the tensor.
* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
* In this case, (2, 4) -> (2), dim = 1.
/*
case 2: squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (2, 4) -> (2), dim = 1.
*/
bool TestReduceSumSquared2()
{
......@@ -141,7 +144,7 @@ bool TestReduceSumSquared2()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a output tensor of size (4) */
/* an output tensor of size (2) */
int tOrder = 1;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
......@@ -150,7 +153,7 @@ bool TestReduceSumSquared2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* a shift tensor of size (4) */
/* a shift tensor of size (2) */
int shiftOrder = 1;
int * shiftDimSize = new int[shiftOrder];
shiftDimSize[0] = 2;
......@@ -159,10 +162,10 @@ bool TestReduceSumSquared2()
for (int i = 0; i < shiftOrder; i++)
shiftUnitNum *= shiftDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE shiftData[2] = {-1.0, 1.0};
DTYPE answer[2] = {30.0, 86.0};
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE shiftData[2] = {-1.0F, 1.0F};
DTYPE answer[2] = {30.0F, 86.0F};
/* CPU test */
bool cpuTest = true;
......@@ -234,7 +237,6 @@ TODO!!
*/
/* test for ReduceSumSquared Function */
extern "C"
bool TestReduceSumSquared()
{
XPRINT(0, stdout, "[TEST ReduceSumSquared] squared sum of the items along a dimension of the tensor\n");
......
......@@ -22,9 +22,11 @@
#include "TReduceVariance.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: variance of the items along a dimension of the tensor.
* For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
* In this case, (2, 4) -> (4), dim = 0.
/*
case 1: variance of the items along a dimension of the tensor.
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
In this case, (2, 4) -> (4), dim = 0.
*/
bool TestReduceVariance1()
{
......@@ -131,7 +133,6 @@ TODO!!
*/
/* test for ReduceVariance Function */
extern "C"
bool TestReduceVariance()
{
XPRINT(0, stdout, "[TEST ReduceVariance] variance of the items along a dimension of the tensor\n");
......
......@@ -22,8 +22,10 @@
#include "TScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: scale and shift all tensor entires.
* p = p * scale + shift
/*
case 1: scale and shift all tensor entries.
p = p * scale + shift
*/
bool TestScaleAndShift1()
{
......@@ -42,8 +44,8 @@ bool TestScaleAndShift1()
DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
{8.5F, 10.5F, 12.5F, 14.5F} };
DTYPE scaleFactor = 2.0;
DTYPE shiftFactor = 0.5;
DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5F;
/* CPU test */
bool cpuTest = true;
......@@ -97,7 +99,6 @@ TODO!!
*/
/* test for ScaleAndShift Function */
extern "C"
bool TestScaleAndShift()
{
XPRINT(0, stdout, "[TEST ScaleAndShift] scale and shift all tensor entires\n");
......
......@@ -20,12 +20,14 @@
*/
#include "TSelect.h"
#include "../xc/Mycode.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SelectRange function.
* It can generate a tensor with seleccted data
* in range[low,high] along the given dimension.
* In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
/*
case 1: test SelectRange function.
It can generate a tensor with selected data in range [low, high] along the given dimension.
In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
*/
bool TestSelect1()
{
......@@ -76,25 +78,25 @@ bool TestSelect1()
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
return cpuTest;
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Select function */
SelectRange(sGPU, 1, 1, 3, tGPU);
SelectRange(sGPU, 2, 1, 3, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
......@@ -121,7 +123,6 @@ TODO!!
*/
/* test for Select Function */
extern "C"
bool TestSelect()
{
XPRINT(0, stdout, "[TEST Select] generate a tensor with seleccted data in range[low,high] along the given dimension \n");
......
......@@ -22,6 +22,7 @@
#include "TSetAscendingOrder.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set the cell to the ascending order along a given dimension.
*/
bool TestSetAscendingOrder1()
......@@ -92,7 +93,6 @@ TODO!!
*/
/* test for SetAscendingOrder Function */
extern "C"
bool TestSetAscendingOrder()
{
XPRINT(0, stdout, "[TEST SetAscendingOrder] set the cell to the ascending order along a given dimension \n");
......
......@@ -22,8 +22,8 @@
#include "TSetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set the cell to the ascending order along a given dimension.
*/
/* case 1: set the cell to the ascending order along a given dimension. */
bool TestSetData1()
{
/* a input tensor of size (2, 4) */
......@@ -83,7 +83,6 @@ TODO!!
*/
/* test for SetData Function */
extern "C"
bool TestSetData()
{
XPRINT(0, stdout, "[TEST SetData] set the data of tensor \n");
......
......@@ -23,9 +23,11 @@
#include "TSigmoid.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test Sigmoid function and SigmoidBackward function.
* sigmoid function: y = 1/(1+exp(-x))
* backward computation: dE/ds = dE/dy * dy/dx
/*
case 1: test Sigmoid function and SigmoidBackward function.
sigmoid function: y = 1/(1+exp(-x))
backward computation: dE/ds = dE/dy * dy/dx
*/
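Here dy/dx is the usual sigmoid derivative y * (1 - y), so the backward step computes dE/dx = dE/dy * y * (1 - y).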
bool TestSigmoid1()
{
......@@ -124,9 +126,10 @@ bool TestSigmoid1()
#endif // USE_CUDA
}
/* case 2: test Sigmoid function and SigmoidBackward function.
* sigmoid function: y = 1/(1+exp(-x))
* backward computation: dE/ds = dE/dy * dy/dx
/*
case 2: test Sigmoid function and SigmoidBackward function.
sigmoid function: y = 1/(1+exp(-x))
backward computation: dE/ds = dE/dy * dy/dx
*/
bool TestSigmoid2()
{
......@@ -234,7 +237,6 @@ bool TestSigmoid2()
*/
/* test for Sigmoid Function */
extern "C"
bool TestSigmoid()
{
XPRINT(0, stdout, "[TEST SIGMOID] sigmoid function and its backward computation \n");
......
......@@ -24,8 +24,10 @@
#include "TSoftmax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test Softmax function.
* softmax function: y = e^x / \sum_{i} e^{x_i}
/*
case 1: test Softmax function.
softmax function: y = e^x / \sum_{i} e^{x_i}
*/
bool TestSoftmax1()
{
......@@ -96,8 +98,9 @@ bool TestSoftmax1()
#endif // USE_CUDA
}
/* case 2: test SoftmaxBackward function.
* SoftmaxBackward function: dE/dx_j = -gold_j + y_j
/*
case 2: test SoftmaxBackward function.
SoftmaxBackward function: dE/dx_j = -gold_j + y_j
*/
bool TestSoftmax2()
{
......@@ -200,7 +203,6 @@ bool TestSoftmax2()
*/
/* test for Softmax Function */
extern "C"
bool TestSoftmax()
{
XPRINT(0, stdout, "[TEST SOFTMAX] softmax function and its backward computation \n");
......
......@@ -22,7 +22,8 @@
#include "TSort.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: sort the tensor along a given dimension*/
/* case 1: sort the tensor along a given dimension */
bool TestSort1()
{
/* a tensor of size (2, 4) */
......@@ -35,10 +36,10 @@ bool TestSort1()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { { 0.0F, 1.0F, 2.0F, 3.0F },
{ 4.0F, 5.0F, 6.0F, 7.0F } };
DTYPE answer[2][4] = { { 4.0F, 5.0F, 6.0F, 7.0F },
{ 0.0F, 1.0F, 2.0F, 3.0F } };
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {4.0F, 5.0F, 6.0F, 7.0F},
{0.0F, 1.0F, 2.0F, 3.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -104,10 +105,10 @@ bool TestSort2()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { { 0.0, 1.0, 2.0, 3.0 },
{ 4.0, 5.0, 6.0, 7.0 } };
DTYPE answer[2][4] = { { 3.0, 2.0, 1.0, 0.0 },
{ 7.0, 6.0, 5.0, 4.0 } };
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { {3.0F, 2.0F, 1.0F, 0.0F},
{7.0F, 6.0F, 5.0F, 4.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -166,7 +167,6 @@ TODO!!
*/
/* test for Sort Function */
extern "C"
bool TestSort()
{
XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");
......
......@@ -19,18 +19,17 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-13
*/
#include "../XTensor.h"
#include "../XDevice.h"
#include "../core/Split.h"
#include "../XList.h"
#include "TSplit.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
* In this case, 4 * 3 -> 2 * 2 * 3, whereToSplit=0, splitNum=2.
/*
case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
In this case, (4, 3) -> (2, 2, 3), whereToSplit=0, splitNum=2.
*/
bool TestSplit1()
{
/* a source tensor of size 4 * 3 */
/* a source tensor of size (4, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
......@@ -40,7 +39,7 @@ bool TestSplit1()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a target tensor of size 2 * 2 * 3 */
/* a target tensor of size (2, 2, 3) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
......@@ -109,12 +108,13 @@ bool TestSplit1()
#endif // USE_CUDA
}
/* case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
* In this case, 3 * 4 -> 2 * 3 * 2, whereToSplit=1, splitNum=2.
/*
case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
In this case, (3, 4) -> (2, 3, 2), whereToSplit=1, splitNum=2.
*/
bool TestSplit2()
{
/* a source tensor of size 3 * 4 */
/* a source tensor of size (3, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
......@@ -124,7 +124,7 @@ bool TestSplit2()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a target tensor of size 2 * 3 * 2 */
/* a target tensor of size (2, 3, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 2;
......@@ -194,8 +194,9 @@ bool TestSplit2()
#endif // USE_CUDA
}
/* case 3: split a big tensor into small tensors
* In this case, 3 * 4 -> 2 * (3 * 2) , whereToSplit=1, splitNum=2.
/*
case 3: split a big tensor into small tensors
In this case, (3, 4) -> 2 * (3, 2), whereToSplit=1, splitNum=2.
*/
bool TestSplit3()
{
......@@ -203,7 +204,7 @@ bool TestSplit3()
XList tList;
tList = XList();
/* a source tensor of size (3 * 4) */
/* a source tensor of size (3, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
......@@ -213,7 +214,7 @@ bool TestSplit3()
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a target tensor of size (3 * 2) */
/* a target tensor of size (3, 2) */
int tOrder1 = 2;
int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 3;
......@@ -313,10 +314,9 @@ TODO!!
*/
/* test for Split Function */
extern "C"
bool TestSplit()
{
XPRINT(0, stdout, "[TEST SPLIT] -------------\n");
XPRINT(0, stdout, "[TEST SPLIT] split a big tensor into small tensors \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
......
......@@ -22,7 +22,8 @@
#include "TSum.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */
/* case 1: tensor summation c = a + b * \beta */
bool TestSum1()
{
/* a tensor of size (2, 4) */
......@@ -35,12 +36,12 @@ bool TestSum1()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE bData[2][4] = { {1.0, -1.0, -3.0, -5.0},
{-7.0, -9.0, -11.0, -13.0} };
DTYPE answer[2][4] = { {1.0, 0.0, -1.0, -2.0},
{-3.0, -4.0, -5.0, -6.0} };
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
{-3.0F, -4.0F, -5.0F, -6.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -95,7 +96,7 @@ bool TestSum1()
#endif // USE_CUDA
}
/* case 2 */
/* case 2: tensor summation c = a + b * \beta */
bool TestSum2()
{
/* a tensor of size (2, 4) */
......@@ -108,12 +109,12 @@ bool TestSum2()
for (int i = 0; i < order; i++) {
unitNum *= dimSize[i];
}
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE bData[2][4] = { {1.0, -1.0, -3.0, -5.0},
{-7.0, -9.0, -11.0, -13.0} };
DTYPE answer[2][4] = { {0.5, 0.5, 0.5, 0.5},
{0.5, 0.5, 0.5, 0.5} };
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {0.5F, 0.5F, 0.5F, 0.5F},
{0.5F, 0.5F, 0.5F, 0.5F} };
float beta = 0.5F;
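E.g., the first entry: 0.0 + 1.0 * 0.5 = 0.5, matching answer[0][0].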
/* CPU test */
......@@ -129,7 +130,7 @@ bool TestSum2()
b->SetData(bData, unitNum);
c->SetZeroAll();
/* call sum function */
/* call Sum function */
Sum(a, b, c, beta);
/* check results */
......@@ -149,7 +150,7 @@ bool TestSum2()
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* call sum function */
/* call Sum function */
Sum(aGPU, bGPU, cGPU, beta);
/* check results */
......@@ -182,8 +183,7 @@ bool TestSum2()
*/
/* test for Sum Function */
extern "C"
bool TestSum()
{
XPRINT(0, stdout, "[TEST SUM] tensor summation c = a + b * beta\n");
bool returnFlag = true, caseFlag = true;
......
......@@ -22,9 +22,10 @@
#include "TSumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SumByColumnTV function
* sum of a tensor and a vector (column vector)
* in a column by column manner
/*
case 1: test SumByColumnTV function
sum of a tensor and a vector (column vector) in a column by column manner
*/
bool TestSumByColumnTV1()
{
......@@ -58,12 +59,12 @@ bool TestSumByColumnTV1()
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE bData[2][1] = { {1.0},
{0.0} };
DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0F},
{0.0F} };
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
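Column by column here means c[i][j] = a[i][j] + b[i]: row 0 is shifted by 1.0 and row 1 by 0.0, giving the answer above.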
/* CPU test */
bool cpuTest = true;
......@@ -128,9 +129,9 @@ bool TestSumByColumnTV1()
#endif // USE_CUDA
}
/* case 2: test SumByColumnTV function
* sum of a tensor and a vector (column vector)
* in a column by column manner
/*
case 2: test SumByColumnTV function
sum of a tensor and a vector (column vector) in a column by column manner
*/
bool TestSumByColumnTV2()
{
......@@ -154,12 +155,12 @@ bool TestSumByColumnTV2()
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE bData[2][1] = { {1.0},
{0.0} };
DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0F},
{0.0F} };
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -222,7 +223,6 @@ bool TestSumByColumnTV2()
*/
/* test for SumByColumnTV Function */
extern "C"
bool TestSumByColumnTV()
{
XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");
......
......@@ -22,9 +22,10 @@
#include "TSumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SumByColumnVT function
* sum of a vector (column vector) and a tensor
* in a column by column manner
/*
case 1: test SumByColumnVT function
sum of a vector (column vector) and a tensor in a column by column manner
*/
bool TestSumByColumnVT1()
{
......@@ -58,12 +59,12 @@ bool TestSumByColumnVT1()
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][1] = { {1.0},
{0.0} };
DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE answer[2][1] = { {7.0},
{22.0} };
DTYPE aData[2][1] = { {1.0F},
{0.0F} };
DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0F},
{22.0F} };
/* CPU test */
bool cpuTest = true;
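Here the result is the column vector itself: the 7 and 22 above are a[0] + (0+1+2+3) and a[1] + (4+5+6+7). A minimal sketch under that reading of SumByColumnVT (again inferred from the test data, not from the kernel source):

#include <cstdio>

/* sketch only: each entry of the column vector accumulates one row of b */
int main()
{
    float a[2] = { 1.0F, 0.0F };
    float b[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                      {4.0F, 5.0F, 6.0F, 7.0F} };
    for (int i = 0; i < 2; i++) {
        float c = a[i];
        for (int j = 0; j < 4; j++)
            c += b[i][j];
        printf("%g\n", c); /* prints 7 then 22, matching `answer` */
    }
    return 0;
}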
......@@ -129,9 +130,9 @@ bool TestSumByColumnVT1()
#endif // USE_CUDA
}
/* case 2: test SumByColumnVT function
* sum of a vector (column vector) and a tensor
* in a column by column manner
/*
case 2: test SumByColumnVT function
sum of a vector (column vector) and a tensor in a column by column manner
*/
bool TestSumByColumnVT2()
{
......@@ -155,12 +156,12 @@ bool TestSumByColumnVT2()
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][1] = { {1.0},
{0.0} };
DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0},
{4.0, 5.0, 6.0, 7.0} };
DTYPE answer[2][1] = { {7.0},
{22.0} };
DTYPE aData[2][1] = { {1.0F},
{0.0F} };
DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0F},
{22.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -223,7 +224,6 @@ bool TestSumByColumnVT2()
*/
/* test for SumByColumnVT Function */
extern "C"
bool TestSumByColumnVT()
{
XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");
......
......@@ -22,10 +22,12 @@
#include "TTopK.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the top-k items along a given dimension.
* In this case,
* (2, 4) -> (2, 4), dim = 0, k = 2
* (2, 4) -> (2, 4), dim = 1, k = 4
/*
case 1: get the top-k items along a given dimension.
In this case,
(2, 4) -> (2, 4), dim = 0, k = 2
(2, 4) -> (2, 4), dim = 1, k = 4
*/
bool TestTopK1()
{
......@@ -49,16 +51,16 @@ bool TestTopK1()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0},
{4.0, 3.0, 7.0, 6.0} };
DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
{4.0F, 3.0F, 7.0F, 6.0F} };
DTYPE tAnswer1[2][4] = { {5.0, 3.0, 7.0, 8.0},
{4.0, 1.0, 2.0, 6.0} };
DTYPE tAnswer1[2][4] = { {5.0F, 3.0F, 7.0F, 8.0F},
{4.0F, 1.0F, 2.0F, 6.0F} };
int indexAnswer1[2][4] = { {0, 1, 1, 0},
{1, 0, 0, 1} };
DTYPE tAnswer2[2][4] = { {8.0, 5.0, 2.0, 1.0},
{7.0, 6.0, 4.0, 3.0} };
DTYPE tAnswer2[2][4] = { {8.0F, 5.0F, 2.0F, 1.0F},
{7.0F, 6.0F, 4.0F, 3.0F} };
int indexAnswer2[2][4] = { {3, 0, 2, 1},
{2, 3, 0, 1} };
......@@ -156,9 +158,9 @@ bool TestTopK1()
#endif // USE_CUDA
}
/* case 2: get the top-k items along a given dimension.
* In this case,
* (2, 4) -> (2, 2), dim = 1, k = 2
/*
case 2: get the top-k items along a given dimension.
In this case, (2, 4) -> (2, 2), dim = 1, k = 2.
*/
bool TestTopK2()
{
......@@ -182,10 +184,10 @@ bool TestTopK2()
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0},
{4.0, 3.0, 7.0, 6.0} };
DTYPE tAnswer[2][2] = { {8.0, 5.0},
{7.0, 6.0} };
DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
{4.0F, 3.0F, 7.0F, 6.0F} };
DTYPE tAnswer[2][2] = { {8.0F, 5.0F},
{7.0F, 6.0F} };
int indexAnswer[2][2] = { {3, 0},
{2, 3} };
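The pairs above (value 8 at index 3, then 5 at index 0, and so on) can be checked with a few lines of standard C++. A reference sketch of top-k along dim = 1 for this case, not the XTensor kernel itself:

#include <algorithm>
#include <cstdio>
#include <numeric>

/* sketch only: top-k (k = 2) along each row of the 2x4 test data */
int main()
{
    float s[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
                      {4.0F, 3.0F, 7.0F, 6.0F} };
    const int k = 2;
    for (int i = 0; i < 2; i++) {
        int idx[4];
        std::iota(idx, idx + 4, 0);
        /* move the k largest entries to the front, in descending order */
        std::partial_sort(idx, idx + k, idx + 4,
                          [&](int x, int y) { return s[i][x] > s[i][y]; });
        for (int j = 0; j < k; j++)
            printf("(%g, %d) ", s[i][idx[j]], idx[j]);
        printf("\n"); /* row 0: (8, 3) (5, 0); row 1: (7, 2) (6, 3) */
    }
    return 0;
}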
......@@ -255,14 +257,12 @@ bool TestTopK2()
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for TopK Function */
extern "C"
bool TestTopK()
{
XPRINT(0, stdout, "[TEST TopK] get the top-k items along a given dimension\n");
......
......@@ -19,15 +19,16 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-13
*/
#include "../XTensor.h"
#include "../core/Unsqueeze.h"
#include "../XList.h"
#include "TUnsqueeze.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: insert a dimension by copying the blocks x times (where x is the size of the inserted dimension)
* In this case,
* (2, 3) -> (2, 2, 3), dim=1, dSize=2
* (2, 3) -> (2, 3, 2), dim=2, dSize=2
/*
case 1: insert a dimension by copying the blocks x times (where x is the size of the inserted dimension)
In this case,
(2, 3) -> (2, 2, 3), dim=1, dSize=2
(2, 3) -> (2, 3, 2), dim=2, dSize=2
*/
bool TestUnsqueeze1()
{
......@@ -63,18 +64,18 @@ bool TestUnsqueeze1()
for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][3] = { {0.0, 1.0, 2.0},
{3.0, 4.0, 5.0} };
DTYPE answer1[2][2][3] = { { {0.0, 1.0, 2.0},
{0.0, 1.0, 2.0} },
{ {3.0, 4.0, 5.0},
{3.0, 4.0, 5.0} } };
DTYPE answer2[2][3][2] = { { {0.0, 0.0},
{1.0, 1.0},
{2.0, 2.0} },
{ {3.0, 3.0},
{4.0, 4.0},
{5.0, 5.0} } };
DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
{3.0F, 4.0F, 5.0F} };
DTYPE answer1[2][2][3] = { { {0.0F, 1.0F, 2.0F},
{0.0F, 1.0F, 2.0F} },
{ {3.0F, 4.0F, 5.0F},
{3.0F, 4.0F, 5.0F} } };
DTYPE answer2[2][3][2] = { { {0.0F, 0.0F},
{1.0F, 1.0F},
{2.0F, 2.0F} },
{ {3.0F, 3.0F},
{4.0F, 4.0F},
{5.0F, 5.0F} } };
/* CPU test */
bool cpuTest = true;
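`answer1` above is just `sData` with each 3-element row duplicated. A minimal sketch of the dim = 1, dSize = 2 case (illustration only, not the XTensor kernel):

#include <cstdio>
#include <cstring>

/* sketch only: (2, 3) -> (2, 2, 3) by copying each block dSize times */
int main()
{
    float s[2][3] = { {0.0F, 1.0F, 2.0F},
                      {3.0F, 4.0F, 5.0F} };
    float t[2][2][3];
    for (int i = 0; i < 2; i++)        /* blocks before the inserted dim */
        for (int d = 0; d < 2; d++)    /* dSize copies of each block */
            std::memcpy(t[i][d], s[i], sizeof(s[i]));
    for (int i = 0; i < 2; i++)
        for (int d = 0; d < 2; d++)
            printf("%g %g %g\n", t[i][d][0], t[i][d][1], t[i][d][2]);
    return 0;
}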
......@@ -148,7 +149,6 @@ bool TestUnsqueeze1()
*/
/* test for Unsqueeze Function */
extern "C"
bool TestUnsqueeze()
{
XPRINT(0, stdout, "[TEST Unsqueeze] insert a dimension by copying the blocks for x times\n");
......
......@@ -19,14 +19,13 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-6-24
*/
#include "TXMem.h"
#include "../XGlobal.h"
#include "../XUtility.h"
#include "../XMem.h"
#include "TXMem.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* case 1: test memory pool class */
bool TestXMemCase1()
{
bool ok = true;
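For readers new to the class under test: a memory pool pre-allocates one large buffer and hands out pieces of it, avoiding per-allocation malloc/free calls. A deliberately tiny bump-pointer sketch of the idea (the real XMem adds device memory, block chains, alignment, and much more, none of which is shown here):

#include <cstdlib>

/* sketch only: the simplest possible pool, for illustration */
struct TinyPool {
    char * base;
    size_t size;
    size_t used;
    TinyPool(size_t s) : size(s), used(0) { base = (char*)std::malloc(s); }
    ~TinyPool() { std::free(base); }
    void * Alloc(size_t s) {
        if (used + s > size)
            return NULL;       /* a real pool would grow or report the error */
        void * p = base + used;
        used += s;
        return p;
    }
};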
......@@ -83,6 +82,7 @@ bool TestXMemCase1()
return ok;
}
/* test for memory pool class */
bool TestXMem()
{
XPRINT(0, stdout, "[Test] Memory pool ... Began\n");
......@@ -93,11 +93,18 @@ bool TestXMem()
/* case 1 test */
caseFlag = TestXMemCase1();
if (!caseFlag) { returnFlag = false; XPRINT(0, stdout, ">> case 1 failed!\n"); }
else {XPRINT(0, stdout, ">> case 1 passed!\n");}
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
if (returnFlag) { XPRINT(0, stdout, ">> All Passed!\n"); }
else { XPRINT(0, stdout, ">> Failed!\n"); }
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
double endT = GetClock();
......@@ -106,4 +113,4 @@ bool TestXMem()
return returnFlag;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file