Commit 0887fae1 by liyinqiao

Format correction.

parent 42f995ae
...@@ -28,6 +28,10 @@ ...@@ -28,6 +28,10 @@
#include "Concatenate.h" #include "Concatenate.h"
#include "ConcatenateSolely.h" #include "ConcatenateSolely.h"
#include "CopyBlocks.h"
#include "CopyBlocksInGrid.h"
#include "CopyBlocksOnSite.h"
#include "CopyData2D.h"
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "CopyInGrid.h" #include "CopyInGrid.h"
#include "CopyValues.h" #include "CopyValues.h"
...@@ -53,6 +57,7 @@ ...@@ -53,6 +57,7 @@
#include "ReduceSumSquared.h" #include "ReduceSumSquared.h"
#include "ReduceVariance.h" #include "ReduceVariance.h"
#include "ScaleAndShift.h" #include "ScaleAndShift.h"
#include "Select.h"
#include "SetData.h" #include "SetData.h"
#include "Sort.h" #include "Sort.h"
#include "Split.h" #include "Split.h"
......
...@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim) ...@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
/* /*
concatenate two tensors along a given dimension concatenate two tensors along a given dimension
>> smallA - one tensor for concatenation
>> smallB - the other tensor for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/ */
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim) void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
{ {
......
...@@ -29,7 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
concatenate a list of tensors along a given dimension concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely" Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes */ or "Merge" by means of the tensor shapes
*/
void Concatenate(XList * smalls, XTensor * big, int dim); void Concatenate(XList * smalls, XTensor * big, int dim);
/* concatenate two tensors along a given dimension */ /* concatenate two tensors along a given dimension */
......
...@@ -64,9 +64,11 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim) ...@@ -64,9 +64,11 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
int offset = 0; int offset = 0;
/* two strategies are used - we can either resort to memcpy2d for the case of /*
two strategies are used - we can either resort to memcpy2d for the case of
concatenation of a few items, or use MergeBlockLists to merge a large number concatenation of a few items, or use MergeBlockLists to merge a large number
of data blocks */ of data blocks
*/
if (smalls->count <= MIN_TENSOR_CAT_NUM) { if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */ /* concatenate a list of tensors along a given dimension */
extern "C" extern "C"
void ConcatenateSolely(XList * smalls, XTensor * big, int dim); void ConcatenateSolely(XList * smalls, XTensor * big, int dim);
......
...@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, ...@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
else { else {
int devID = myMem != NULL ? myMem->devID : -1; int devID = myMem != NULL ? myMem->devID : -1;
/* The following code should be fine with GPUs, but too many /*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion). */ one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID, XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize); (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "CopyBlocksOnSite.cuh" #include "CopyBlocksOnSite.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
copy a number of blocks to target positions. Here we assume that copy a number of blocks to target positions. Here we assume that
all the data has been on the device (CPU/GPU) already. all the data has been on the device (CPU/GPU) already.
...@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, ...@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
else { else {
int devID = myMem != NULL ? myMem->devID : -1; int devID = myMem != NULL ? myMem->devID : -1;
/* The following code should be fine with GPUs, but too many /*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion). */ one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) { for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID, XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize); (char*)source + b, devID, blockSize);
......
...@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece ...@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
in the k-th grid in the k-th grid
>> blockDim - leading dimension of blocks >> blockDim - leading dimension of blocks
>> blockNumInGrid - number of blocks in each grid >> blockNumInGrid - number of blocks in each grid
>> isOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev) void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{ {
......
...@@ -36,6 +36,7 @@ copy indexed sub-tensors ...@@ -36,6 +36,7 @@ copy indexed sub-tensors
>> tgtIndex - index of the target sub-tensors >> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g., >> copyNum - number of the sub-tensors we copy for each source index, e.g.,
for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5 for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
<< return - whether copy indexed operation was successful
*/ */
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{ {
......
...@@ -28,7 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/**************************************/
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C" extern "C"
bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL); bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
......
...@@ -52,7 +52,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem) ...@@ -52,7 +52,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
else else
reqiredSize = m->unitSize * m->unitNum; reqiredSize = m->unitSize * m->unitNum;
//reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + size, reqiredSize);
size += reqiredSize; size += reqiredSize;
} }
...@@ -70,7 +69,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem) ...@@ -70,7 +69,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
else else
pSize = m->unitSize * m->unitNum; pSize = m->unitSize * m->unitNum;
//reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + p, pSize);
reqiredSize = pSize; reqiredSize = pSize;
memcpy(data + p, m->data, pSize); memcpy(data + p, m->data, pSize);
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "MakeSplitBlockIndex.cuh" #include "MakeSplitBlockIndex.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set target data block index for the data movement in split set target data block index for the data movement in split
>> blockIndex - block index >> blockIndex - block index
......
...@@ -51,6 +51,7 @@ void KernelMakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSiz ...@@ -51,6 +51,7 @@ void KernelMakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSiz
/* /*
set target data block index for the data movement in split set target data block index for the data movement in split
>> devID - device id
>> blockIndex - block index >> blockIndex - block index
>> splitNum - number of splits >> splitNum - number of splits
>> blockSplitSize - size of the split block >> blockSplitSize - size of the split block
......
...@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1] ...@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> transposedA - indicate whether the matrix a is transposed >> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors) >> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed >> transposedB - indicate whether the matrix b is transposed
>> c - output matrix (2d tensor)
>> alpha - scalar >> alpha - scalar
>> beta - scalar >> beta - scalar
>> c - output matrix (2d tensor)
*/ */
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB, XList * b, MATRIX_TRANS_TYPE transposedB,
...@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, ...@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
} }
} }
//if(isUniform){
//}
//else{
for (int i = 0; i < a->count; i++) { for (int i = 0; i < a->count; i++) {
XTensor * ai = (XTensor*)a->GetItem(i); XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i); XTensor * bi = (XTensor*)b->GetItem(i);
......
...@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y. ...@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y.
*/ */
extern "C" extern "C"
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -104,7 +104,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -104,7 +104,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int num = *((int*)b->data); int num = *((int*)b->data);
char * p = (char*)b->data + sizeof(int); // pointer to the first tuple char * p = (char*)b->data + sizeof(int); // pointer to the first tuple
/* a * b */ /* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) { if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
int key = *((int*)p); int key = *((int*)p);
......
...@@ -37,11 +37,13 @@ c = a * b * \alpha ...@@ -37,11 +37,13 @@ c = a * b * \alpha
>> aColSize - column size of matrix a >> aColSize - column size of matrix a
>> aRowSize - row size of matrix a >> aRowSize - row size of matrix a
>> b - a sparse matrix >> b - a sparse matrix
>> transposedA - indicates whether b is transposed >> transposedB - indicates whether b is transposed
>> bNonZeroNum - number of non-zero items in b >> bNonZeroNum - number of non-zero items in b
>> bColSize - column size of matrix b >> bColSize - column size of matrix b
>> bRowSize - row size of matrix b >> bRowSize - row size of matrix b
>> c - the resulting (dense) matrix >> c - the resulting (dense) matrix
>> cColSize - column size of matrix c
>> cRowSize - row size of matrix c
>> alpha - the scaling factor >> alpha - the scaling factor
*/ */
extern "C" __global__ extern "C" __global__
...@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) { if (!a->isSparse && !b->isSparse) {
CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!"); CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");
//cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle(); cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */ /* !!!! might have problems */
...@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (beta == 0) if (beta == 0)
c->SetZeroAll(); c->SetZeroAll();
else if (beta != 1.0F) { else if (beta != 1.0F) {
//XTensor::ScaleAndShift(c, beta, 0);
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
...@@ -40,6 +40,7 @@ where trans() returns the transposed matrix if the flag is fired ...@@ -40,6 +40,7 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b >> c - where we keep a*b
>> alpha - a coefficient >> alpha - a coefficient
>> beta - another coefficient >> beta - another coefficient
>> parallelRunner - parallel processing module
*/ */
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * b, MATRIX_TRANS_TYPE transposedB,
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
transform a tensor by merging it along a dimension, e.g., (N/3, M, 3) -> (N, M) transform a tensor by merging it along a dimension, e.g., (N/3, M, 3) -> (N, M)
>> s - the source tensor >> s - the source tensor
......
...@@ -27,12 +27,12 @@ ...@@ -27,12 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
merge data by blocks merge data by blocks
>> sourceList - list of source data array >> sourceList - list of source data array
>> blockSizes - list of the block size for each source data array >> blockSizes - list of the block size for each source data array
>> blockNum - number of blocks kept in each data array >> blockNum - number of blocks kept in each data array
>> target - target data array >> target - target data array
>> myMem - memory pool >> myMem - memory pool
*/ */
void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem) void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{ {
......
...@@ -34,10 +34,9 @@ copy a number of blocks (of different sizes) to target positions ...@@ -34,10 +34,9 @@ copy a number of blocks (of different sizes) to target positions
>> sourceBlockSizes - the size of the block_i >> sourceBlockSizes - the size of the block_i
>> sourceBlockNum - number of blocks to merge >> sourceBlockNum - number of blocks to merge
>> targetList - list of data arrays to copy to >> targetList - list of data arrays to copy to
>> target - target data array
*/ */
__global__ __global__
void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[]) void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
{ {
__shared__ int iBlockSizes[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ int iBlockSizes[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE * iSourceList[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ DTYPE * iSourceList[MAX_CUDA_THREAD_NUM_PER_BLOCK];
...@@ -82,7 +81,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi ...@@ -82,7 +81,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
int minBlockSize = MAX_INT; int minBlockSize = MAX_INT;
int maxBlockSize = -MAX_INT; int maxBlockSize = -MAX_INT;
//int realMinBlockSize = 1;
int realMaxBlockSize = 1; int realMaxBlockSize = 1;
DTYPE ** sourceArrays = new DTYPE*[newBlockListSize]; DTYPE ** sourceArrays = new DTYPE*[newBlockListSize];
DTYPE ** targetArrays = new DTYPE*[newBlockListSize]; DTYPE ** targetArrays = new DTYPE*[newBlockListSize];
...@@ -110,7 +108,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi ...@@ -110,7 +108,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
CheckNTErrors((minBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!"); CheckNTErrors((minBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
CheckNTErrors((maxBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!"); CheckNTErrors((maxBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
//realMinBlockSize = minBlockSize/sizeof(DTYPE);
realMaxBlockSize = maxBlockSize / sizeof(DTYPE); realMaxBlockSize = maxBlockSize / sizeof(DTYPE);
int cudaGridSizes[3]; int cudaGridSizes[3];
...@@ -120,31 +117,16 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi ...@@ -120,31 +117,16 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
cudaGridSizes, cudaBlockSizes); cudaGridSizes, cudaBlockSizes);
myMem->SetPinBuf(); myMem->SetPinBuf();
//MTYPE offset0 = myMem->bufUsed;
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256); int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
//MTYPE offset1 = myMem->bufUsed;
DTYPE ** sourceArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256); DTYPE ** sourceArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);
//MTYPE offset2 = myMem->bufUsed;
DTYPE ** targetArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256); DTYPE ** targetArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);
//MTYPE bufSize = myMem->bufUsed - offset0;
//char * CPUBuf = new char[bufSize];
//memset(CPUBuf, 0 , bufSize);
//memcpy(CPUBuf, sizes, sizeof(int) * newBlockListSize);
//memcpy(CPUBuf + (offset1 - offset0), sourceArrays, sizeof(DTYPE*) * newBlockListSize);
//memcpy(CPUBuf + (offset2 - offset0), targetArrays, sizeof(DTYPE*) * newBlockListSize);
XMemCopy(sizesGPU, myMem->devID, sizes, -1, sizeof(int) * newBlockListSize); XMemCopy(sizesGPU, myMem->devID, sizes, -1, sizeof(int) * newBlockListSize);
XMemCopy(sourceArraysGPU, myMem->devID, sourceArrays, -1, sizeof(DTYPE*) * newBlockListSize); XMemCopy(sourceArraysGPU, myMem->devID, sourceArrays, -1, sizeof(DTYPE*) * newBlockListSize);
XMemCopy(targetArraysGPU, myMem->devID, targetArrays, -1, sizeof(DTYPE*) * newBlockListSize); XMemCopy(targetArraysGPU, myMem->devID, targetArrays, -1, sizeof(DTYPE*) * newBlockListSize);
/* it is VERY tricky here because we squeeze three data copies into one */
//XMemCopy(sizesGPU, myMem->devID, CPUBuf, -1, bufSize);
KernelCopyBlockLists << <dim3(cudaGridSizes[0], cudaGridSizes[1]), dim3(cudaBlockSizes[0], cudaBlockSizes[1]) >> > KernelCopyBlockLists << <dim3(cudaGridSizes[0], cudaGridSizes[1]), dim3(cudaBlockSizes[0], cudaBlockSizes[1]) >> >
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU); (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
...@@ -154,7 +136,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi ...@@ -154,7 +136,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
delete[] targetArrays; delete[] targetArrays;
delete[] sizes; delete[] sizes;
delete[] offsets; delete[] offsets;
//delete[] CPUBuf;
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "MultiplyElementWise.cuh" #include "MultiplyElementWise.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
element-wise product of two tensors element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
......
...@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a ...@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
>> a - tensor a >> a - tensor a
>> b - tensor b >> b - tensor b
>> c - result tensor >> c - result tensor
>> alpha - the coefficient
>> stride - the number of items we go over when move next along the leading dimension in a block >> stride - the number of items we go over when move next along the leading dimension in a block
>> ldSizeA - size of the leading dimension of a >> ldSizeA - size of the leading dimension of a
>> ldSizeB - size of the leading dimension of b >> ldSizeB - size of the leading dimension of b
......
...@@ -26,8 +26,8 @@ ...@@ -26,8 +26,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set every entry to its minus value set every entry to its minus value
>> a - the tensor we are processing >> a - the tensor we are processing
*/ */
void Negate(XTensor * a) void Negate(XTensor * a)
{ {
......
...@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size) ...@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
} }
/* /*
set each entry to its negative value (CUDA Kernel) set each entry to its negative value (CUDA Kernel)
This is for float16 computation This is for float16 computation
>> d - pointer to the data array >> d - pointer to the data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelNegate(__half * d, int size) void KernelNegate(__half * d, int size)
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "Normalize.cuh" #include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
normalized the data with normal distribution. For an input x, normalized the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b y = a * (x-mean)/sqrt(variance+\epsilon) + b
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "Normalize.cuh" #include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
normalized the data with normal distribution (kernel code). For an input x, normalized the data with normal distribution (kernel code). For an input x,
......
...@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* normalized the data with normal distribution (Kernel code). For an input x, /*
normalized the data with normal distribution (Kernel code). For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/ */
...@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var, ...@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
DTYPE * a, DTYPE * b, DTYPE epsilon, DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum); int stride, int strideNum, int blockNum);
/* normalized the data with normal distribution. For an input x, /*
normalized the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/ */
......
...@@ -25,10 +25,11 @@ ...@@ -25,10 +25,11 @@
#include "Power.cuh" #include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
get the power(a, p) get the power(a, p)
>> a - the tensor >> a - the tensor
>> power - as it is >> p - as it is
*/ */
void Power(XTensor * a, DTYPE p) void Power(XTensor * a, DTYPE p)
{ {
......
...@@ -87,9 +87,6 @@ __global__ ...@@ -87,9 +87,6 @@ __global__
void KernelPower(__half * d, __half p, int size) void KernelPower(__half * d, __half p, int size)
{ {
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//int i = blockDim.x * blockIdx.x + threadIdx.x;
//if (i < size)
// d[i] = hpow(d[i], p);
#else #else
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
...@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p) ...@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
} }
else if (p != (DTYPE)1.0) { else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
//unsigned short p2 = FloatToFloat16(p);
//__half * pp = (__half*)&p2;
//KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
} }
} }
else { else {
......
...@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
reduce a tensor to another that keeps the max value along a dimension - slow version reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have Given a block of data, we go over each dimension i in the stride and we have
sum_i = max_{0<=j<strideNum} input_{i,j} sum_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row. crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor) >> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array >> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item >> stride - stride that we need to move to the next item
...@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output, ...@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
} }
/* /*
reduce a tensor to another that keeps the max value along a dimension - slow version reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
Given a block of data, we go over each dimension i in the stride and we have sum_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the
sum_i = max_{0<=j<strideNum} input_{i,j} crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
where we can view the block as a matrix and input_{i,j} represents the item at the >> output - the sum over each block. NOTE: output is also an array
crossing of the i-th column and the j-th row. >> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> input - the input array (representing a tensor) >> reducedStrideNum - the number of strides after reduction
>> output - the sum over each block. NOTE: output is also an array >> blockSize - size of the block (i.e., stride * strideNum)
>> stride - stride that we need to move to the next item >> blockNum - how many blocks
>> strideNum - how many strides we need to finish the reduce */
>> reducedStrideNum - the number of strides after reduction __global__
>> blockSize - size of the block (i.e., stride * strideNum) void KernelReduceMax(__half * input, __half * output,
>> blockNum - how many blocks int stride, int strideNum, int reducedStrideNum,
*/ int blockSize, int blockNum)
__global__ {
void KernelReduceMax(__half * input, __half * output, int idx = threadIdx.x * blockDim.y + threadIdx.y;
int stride, int strideNum, int reducedStrideNum, unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
int blockSize, int blockNum) unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
{
int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
if (i >= stride * blockNum) if (i >= stride * blockNum)
return; return;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2]; __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
#else #else
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2]; __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
#endif #endif
__syncthreads(); __syncthreads();
int k = i / stride; int k = i / stride;
int iOffset = i % stride; int iOffset = i % stride;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half value = (i < stride * blockNum && j < strideNum) ? __half value = (i < stride * blockNum && j < strideNum) ?
input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN); input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN);
#else #else
DTYPE value = (i < stride * blockNum && j < strideNum) ? DTYPE value = (i < stride * blockNum && j < strideNum) ?
__half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN; __half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
#endif #endif
/* load data into the shared mem */ /* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value; iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
__syncthreads(); __syncthreads();
/* do reduction in shared mem */ /* do reduction in shared mem */
for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) { for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
if (threadIdx.y < s && iData[idx] < iData[idx + s]) { if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
iData[idx] = iData[idx + s]; iData[idx] = iData[idx + s];
} }
__syncthreads(); __syncthreads();
} }
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
/* write result for this block to the output array */ /* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y]; output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
#else #else
/* write result for this block to the output array */ /* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]); output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
#endif #endif
} }
/* /*
reduce a tensor to another that keeps the max value along a dimension - fast version reduce a tensor to another that keeps the max value along a dimension - fast version
>> input - the input array (representing a tensor) >> input - the input array (representing a tensor)
...@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output, ...@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
/* /*
get the max-valued items along a dimension of the tensor (cuda version). get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a, For a 1-dimensional data array a,
max_i = max_{0<=j<strideNum} input_{i,j} max_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - which dimension to reduce >> dim - which dimension to reduce
......
...@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
get the mean value along a dimension of the tensor. For a 1-dimensional data array a, get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * sum_i input_i mean = (1/n) * sum_i input_i
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
...@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim) ...@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
ScaleAndShift(output, (DTYPE)1/num, 0); ScaleAndShift(output, (DTYPE)1/num, 0);
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
reduce a tensor to another that keeps the sum along a dimension - slow version reduce a tensor to another that keeps the sum along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have Given a block of data, we go over each dimension i in the stride and we have
sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true; sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
= sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false; = sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
where we can view the block as a matrix and input_{i,j} represent the item at the where we can view the block as a matrix and input_{i,j} represent the item at the
crossing of the i-th column and the j-th row. crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor) >> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array >> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item >> stride - stride that we need to move to the next item
...@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output, ...@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
/* /*
reduce a tensor to another that keeps the sum along a dimension - slow version reduce a tensor to another that keeps the sum along a dimension - slow version
This is for float16 reduction. This is for float16 reduction.
Given a block of data, we go over each dimension i in the stride and we have Given a block of data, we go over each dimension i in the stride and we have
sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true; sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
= sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false; = sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
where we can view the block as a matrix and input_{i,j} represent the item at the where we can view the block as a matrix and input_{i,j} represent the item at the
crossing of the i-th column and the j-th row. crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor) >> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array >> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item >> stride - stride that we need to move to the next item
...@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
/* /*
reduce a tensor to another that keeps the sum along a dimension - fast version reduce a tensor to another that keeps the sum along a dimension - fast version
This is for float16 reduction This is for float16 reduction
>> input - the input array (representing a tensor) >> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array >> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item >> stride - stride that we need to move to the next item
......
...@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
squared sum of the items along a dimension of the tensor. squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a, For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2 sum = \sum_i (a_i - shift)^2
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
......
...@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
variance of the items along a dimension of the tensor. variance of the items along a dimension of the tensor.
For a 1-dimensional data array a, For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2 variance = 1/n * \sum_i (a_i - mean)^2
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
......
...@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
scale and shift all tensor entries scale and shift all tensor entries
p = p * scale + shift p = p * scale + shift
>> a - the tensor >> a - the tensor
>> scale - the scaling factor >> scale - the scaling factor
>> shift - the shift factor >> shift - the shift factor
......
...@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift) ...@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
/* /*
scale and shift all matrix entries scale and shift all matrix entries
p = p * scale + shift p = p * scale + shift
>> a - the tensor >> a - the tensor
>> scale - the scaling factor >> scale - the scaling factor
>> shift - the shift factor >> shift - the shift factor
......
...@@ -31,7 +31,7 @@ c = select(a) ...@@ -31,7 +31,7 @@ c = select(a)
>> dim - the dimension along which we do the job >> dim - the dimension along which we do the job
>> low - lower bound >> low - lower bound
>> high - higher bound. >> high - higher bound.
Note that range [1,3] means that we select 1 and 2. Note that range [1,3] means that we select 1 and 2.
>> c - result tensor >> c - result tensor
*/ */
void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c) void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
...@@ -75,5 +75,4 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c) ...@@ -75,5 +75,4 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high) ...@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
ShowNTErrors("TODO"); ShowNTErrors("TODO");
} }
} }
/* GPU code /*
The trick here is to initialize the data on a temporary tensor on CPU. GPU code
The CPU data is then copied to GPU. The trick here is to initialize the data on a temporary tensor on CPU.
TODO: generate data points on GPUs straightforwardly. The CPU data is then copied to GPU.
TODO: generate data points on GPUs straightforwardly.
*/ */
else{ else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1); XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
......
...@@ -39,6 +39,7 @@ void Sort(XTensor * a, XTensor * index, int dim) ...@@ -39,6 +39,7 @@ void Sort(XTensor * a, XTensor * index, int dim)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1; int dimRDI = a->order - dim - 1;
/* make the index tensor */ /* make the index tensor */
index->SetAscendingOrder(dim); index->SetAscendingOrder(dim);
......
...@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* sort the tensor along a given dimension */ /* sort the tensor along a given dimension */
extern "C"
void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1); void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */ /* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
extern "C"
void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum); void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */ /* split a big tensor into small tensors */
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
summation of data arrays (CUDA Kernel) summation of data arrays (CUDA Kernel)
c = a + b * \beta c = a + b * \beta
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* summation of data arrays (CUDA Kernel) */ /* summation of data arrays (CUDA Kernel) */
extern "C" __global__ extern "C" __global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0); void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
summation of a vector (column vector) and a tensor summation of a vector (column vector) and a tensor
c = a + \sum{col} b_col * \beta c = a + \sum{col} b_col * \beta
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */ /* sum of a (column) vector and a tensor */
extern "C" extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "TopK.cuh" #include "TopK.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
get the top-k items along a given dimension get the top-k items along a given dimension
>> a - input tensor >> a - input tensor
......
...@@ -95,9 +95,11 @@ public: ...@@ -95,9 +95,11 @@ public:
/* swap */ /* swap */
__device__ void Swap(int i, int j) __device__ void Swap(int i, int j)
{ {
/*CudaHeapNode<T> tmp = items[i]; /*
CudaHeapNode<T> tmp = items[i];
items[i] = items[j]; items[i] = items[j];
items[j] = tmp;*/ items[j] = tmp;
*/
int tmpIndex = items[i].index; int tmpIndex = items[i].index;
T tmpValue = items[i].value; T tmpValue = items[i].value;
items[i] = items[j]; items[i] = items[j];
...@@ -239,8 +241,10 @@ void KernelTopK(T * input, int stride, int strideNum, int blockNum, int k, T min ...@@ -239,8 +241,10 @@ void KernelTopK(T * input, int stride, int strideNum, int blockNum, int k, T min
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
CudaXHeap<MIN_HEAP, T> heapFinal(k, k, heapData + k * threadIdx.y * blockDim.x); CudaXHeap<MIN_HEAP, T> heapFinal(k, k, heapData + k * threadIdx.y * blockDim.x);
/* merge the result over the workers. /*
This can be improved by parallel merging */ merge the result over the workers.
This can be improved by parallel merging
*/
if (blockDim.x > 1) { if (blockDim.x > 1) {
for (int p = 1; p < blockDim.x && p < strideNum; p++) { for (int p = 1; p < blockDim.x && p < strideNum; p++) {
CudaHeapNode<T> * hd = heapData + k * (threadIdx.y * blockDim.x + p); CudaHeapNode<T> * hd = heapData + k * (threadIdx.y * blockDim.x + p);
...@@ -429,6 +433,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -429,6 +433,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
} }
} }
/* we resort to sorting if the data cannot fit inside the shared memory */ /* we resort to sorting if the data cannot fit inside the shared memory */
else { else {
int dimSize[MAX_TENSOR_DIM_NUM]; int dimSize[MAX_TENSOR_DIM_NUM];
......
...@@ -227,7 +227,7 @@ int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex) ...@@ -227,7 +227,7 @@ int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex)
x2 = colSize - 1; x2 = colSize - 1;
y2 = rowSize - 1; // bottom-right corner y2 = rowSize - 1; // bottom-right corner
/* the main body of the matrix (after removing the margin block) */ /* the main body of the matrix (after removing the margin block) */
while (x1 <= xMax) { while (x1 <= xMax) {
y1 = 0; y1 = 0;
x2 = x1 + colSize - 1; x2 = x1 + colSize - 1;
......
...@@ -26,9 +26,7 @@ ...@@ -26,9 +26,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/******************************************************************* /* segmentation and parallel processing for 2d tensors (i.e., matrices) */
segmentation and parallel processing for 2d tensors (i.e., matrices)
*/
/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */ /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
extern "C" extern "C"
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...); void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
......
...@@ -28,9 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,9 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /* matrix multiplication via cuda version BLAS */
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMUL(cublasHandle_t * handle, void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
...@@ -85,9 +83,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -85,9 +83,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
} }
} }
/* /* matrix multiplication via cuda version BLAS */
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
...@@ -143,7 +139,6 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -143,7 +139,6 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
} }
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
...@@ -198,9 +193,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -198,9 +193,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
} }
} }
/* /* matrix multiplication via cuda version BLAS */
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMULList(cublasHandle_t * handle, void CudaBLASMatrixMULList(cublasHandle_t * handle,
XList * a, MATRIX_TRANS_TYPE transposedA, XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB, XList * b, MATRIX_TRANS_TYPE transposedB,
......
...@@ -37,33 +37,41 @@ public: ...@@ -37,33 +37,41 @@ public:
concatenate a list of tensors along a given dimension concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely" Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes */ or "Merge" by means of the tensor shapes */
extern "C"
void Concatenate(XList * smalls, XTensor * big, int dim); void Concatenate(XList * smalls, XTensor * big, int dim);
/* concatenate two tensors along a given dimension */ /* concatenate two tensors along a given dimension */
extern "C"
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim); void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim);
/* concatenate a list of tensors along a given dimension */ /* concatenate a list of tensors along a given dimension */
extern "C"
static static
void ConcatenateSolely(XList * smalls, XTensor * big, int dim); void ConcatenateSolely(XList * smalls, XTensor * big, int dim);
/* copy selected sub-tensors */ /* copy selected sub-tensors */
extern "C"
static static
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* copy a number of blocks in grid */ /* copy a number of blocks in grid */
extern "C"
static static
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false); void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
/* copy s to t */ /* copy s to t */
extern "C"
static static
bool CopyValues(XTensor * s, XTensor * t, XStream * stream = NULL); bool CopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
/* set target data block index for the data movement in merge */ /* set target data block index for the data movement in merge */
extern "C"
static static
void MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge, void MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem); int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
static static
void MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem); void MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
...@@ -78,6 +86,7 @@ public: ...@@ -78,6 +86,7 @@ public:
tensor of the result C. C should be a tensor of z * x * n * m. Obviously C = A * B performs tensor of the result C. C should be a tensor of z * x * n * m. Obviously C = A * B performs
normal matrix multiplication if A = y * z and B = x * y. normal matrix multiplication if A = y * z and B = x * y.
*/ */
extern "C"
static static
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
...@@ -87,6 +96,7 @@ public: ...@@ -87,6 +96,7 @@ public:
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
extern "C"
static static
void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
...@@ -95,6 +105,7 @@ public: ...@@ -95,6 +105,7 @@ public:
matrix multiplication for a block (x1,y1) - (x2,y2) matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/ */
extern "C"
static static
void MatrixMul2DMultiTheading(XList * args); void MatrixMul2DMultiTheading(XList * args);
...@@ -103,6 +114,7 @@ public: ...@@ -103,6 +114,7 @@ public:
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
extern "C"
static static
void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
...@@ -114,29 +126,36 @@ public: ...@@ -114,29 +126,36 @@ public:
ci = trans(ai) * trans(bi) * alpha + cm * beta ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
*/ */
extern "C"
static static
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/* matrix multiplication in batch mode (CPU code) */ /* matrix multiplication in batch mode (CPU code) */
extern "C"
static static
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/* transform a tensor by merging it along a dimension, e.g., (M, N/3, 3) -> (M, N) */ /* transform a tensor by merging it along a dimension, e.g., (M, N/3, 3) -> (M, N) */
void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1); extern "C"
void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */ /* merge small tensors into a big tensor */
extern "C"
void Merge(XList * smalls, XTensor * big, int whereToMerge); void Merge(XList * smalls, XTensor * big, int whereToMerge);
/* merge data by blocks */ /* merge data by blocks */
extern "C"
void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem); void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
/* element-wise product of two tensors */ /* element-wise product of two tensors */
extern "C"
static static
void MultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha = 0); void MultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha = 0);
/* set every entry to its minus value */ /* set every entry to its minus value */
extern "C"
void Negate(XTensor * a); void Negate(XTensor * a);
/* /*
...@@ -144,13 +163,16 @@ public: ...@@ -144,13 +163,16 @@ public:
y = a * (x-mean)/sqrt(variance+\epsilon) + b y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
extern "C"
static static
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon); void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon);
/* get the power(x, y) */ /* get the power(x, y) */
extern "C"
void Power(XTensor * a, DTYPE p); void Power(XTensor * a, DTYPE p);
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
extern "C"
static static
void ReduceMax(XTensor * input, XTensor * output, int dim); void ReduceMax(XTensor * input, XTensor * output, int dim);
...@@ -158,6 +180,7 @@ public: ...@@ -158,6 +180,7 @@ public:
get the mean value along a dimension of the tensor. For a 1-dimensional data array a, get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * sum_i input_i mean = (1/n) * sum_i input_i
*/ */
extern "C"
static static
void ReduceMean(XTensor * input, XTensor * output, int dim); void ReduceMean(XTensor * input, XTensor * output, int dim);
...@@ -165,6 +188,7 @@ public: ...@@ -165,6 +188,7 @@ public:
standard variance of the items along a dimension of the tensor. For a 1-dimensional data array a, standard variance of the items along a dimension of the tensor. For a 1-dimensional data array a,
variance = (1/n * \sum_i (a_i - mean)^2)^0.5 variance = (1/n * \sum_i (a_i - mean)^2)^0.5
*/ */
extern "C"
static static
void ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean); void ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
...@@ -173,6 +197,7 @@ public: ...@@ -173,6 +197,7 @@ public:
sum = \sum_i (a_i - shift) if isExp == false sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
extern "C"
static static
void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false); void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
...@@ -180,6 +205,7 @@ public: ...@@ -180,6 +205,7 @@ public:
squared sum of the items along a dimension of the tensor. For a 1-dimensional data array a, squared sum of the items along a dimension of the tensor. For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2 sum = \sum_i (a_i - shift)^2
*/ */
extern "C"
static static
void ReduceSumSquared(XTensor * input, XTensor * output, int dim, XTensor * shift); void ReduceSumSquared(XTensor * input, XTensor * output, int dim, XTensor * shift);
...@@ -187,60 +213,73 @@ public: ...@@ -187,60 +213,73 @@ public:
variance of the items along a dimension of the tensor. For a 1-dimensional data array a, variance of the items along a dimension of the tensor. For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2 variance = 1/n * \sum_i (a_i - mean)^2
*/ */
extern "C"
static static
void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean); void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
/* scale and shift all tensor entries */ /* scale and shift all tensor entries */
extern "C"
static static
void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift); void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
/* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */ /* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
extern "C"
void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum); void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */ /* split a big tensor into small tensors */
extern "C"
void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum); void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum);
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
extern "C"
static static
void Sum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void Sum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* sum of a tensor and a (column) vector */ /* sum of a tensor and a (column) vector */
extern "C"
static static
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* sum of a (column) vector and a tensor */ /* sum of a (column) vector and a tensor */
extern "C"
static static
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
extern "C"
static static
void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k); void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */ /* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize); extern "C"
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
/* segmentation and parallel processing for 2d tensors (i.e., matrices) */
/*******************************************************************
segmentation and parallel processing for 2d tensors (i.e., matrices)
*/
/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */ /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
static extern "C"
static
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...); void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
/* segment a block into sub-blocks */ /* segment a block into sub-blocks */
extern "C"
static static
int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex); int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex);
/* segment a block into sub-blocks */ /* segment a block into sub-blocks */
extern "C"
static static
int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex); int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex);
/* matrix multiplication (BLAS) */ /* matrix multiplication (BLAS) */
extern "C"
static static
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA #ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */ /* matrix multiplication via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMUL(cublasHandle_t * handle, void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
...@@ -249,6 +288,7 @@ public: ...@@ -249,6 +288,7 @@ public:
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
...@@ -257,6 +297,7 @@ public: ...@@ -257,6 +297,7 @@ public:
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
...@@ -265,6 +306,7 @@ public: ...@@ -265,6 +306,7 @@ public:
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
......
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
hard tanh function hard tanh function
y = 1 if x > 1 y = 1 if x > 1
......
...@@ -95,7 +95,6 @@ dy/dx = 1 if -1 <= x <= 1 ...@@ -95,7 +95,6 @@ dy/dx = 1 if -1 <= x <= 1
>> y - y of the function >> y - y of the function
>> x - x of the function >> x - x of the function
>> size - size of y/x >> size - size of y/x
*/ */
__global__ __global__
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size) void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
......
...@@ -49,7 +49,6 @@ void LogSoftmax(XTensor * x, XTensor * y, int leadDim) ...@@ -49,7 +49,6 @@ void LogSoftmax(XTensor * x, XTensor * y, int leadDim)
dimSize[i - 1] = -x->dimSize[i]; dimSize[i - 1] = -x->dimSize[i];
} }
XMem * mem = x->mem; XMem * mem = x->mem;
XTensor * max = NULL; XTensor * max = NULL;
XTensor * sum = NULL; XTensor * sum = NULL;
...@@ -168,7 +167,6 @@ dE/dx = dE/dy * dy/dx ...@@ -168,7 +167,6 @@ dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k}) log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
dy_i/dx_j dy_i/dx_j
= d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j = d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
= d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j = d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j
......
...@@ -41,7 +41,8 @@ void CudaLogSoftmax(XTensor * x, XTensor * y, int leadDim) ...@@ -41,7 +41,8 @@ void CudaLogSoftmax(XTensor * x, XTensor * y, int leadDim)
ShowNTErrors("You should call LogSoftmax instead!"); ShowNTErrors("You should call LogSoftmax instead!");
} }
/* log softmax forward computation (Cuda kernel) /*
log softmax forward computation (Cuda kernel)
for each column j, let y_{i,j} and x_{i,j} be the output for each column j, let y_{i,j} and x_{i,j} be the output
and state value for the i-th element of column j. We have and state value for the i-th element of column j. We have
...@@ -85,7 +86,8 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y ...@@ -85,7 +86,8 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
} }
} }
/* log softmax forward computation (Cuda kernel) /*
log softmax forward computation (Cuda kernel)
for each row i, let y_{i,j} and x_{i,j} be the output for each row i, let y_{i,j} and x_{i,j} be the output
and state value for the j-th element of row i. We have and state value for the j-th element of row i. We have
...@@ -182,7 +184,7 @@ void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, ...@@ -182,7 +184,7 @@ void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum,
/* /*
set dE/dx = exp(y) set dE/dx = exp(y)
>> dedu - dE/dy >> dedy - dE/dy
>> dedx - dE/dx >> dedx - dE/dx
>> y - output of the function >> y - output of the function
>> size - size of output >> size - size of output
...@@ -256,7 +258,9 @@ dE/dx_j += -gold_j ...@@ -256,7 +258,9 @@ dE/dx_j += -gold_j
>> gold - gold standard to measure error (or loss) >> gold - gold standard to measure error (or loss)
>> y - output of the function >> y - output of the function
>> x - input of the function >> x - input of the function
>> size - size of input/output >> rowNum - row number of the matrix
>> colNum - column number of the matrix
>> gNonZeroNum -
>> lossName - name of the loss function >> lossName - name of the loss function
*/ */
__global__ __global__
...@@ -293,7 +297,6 @@ dE/dx = dE/dy * dy/dx ...@@ -293,7 +297,6 @@ dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k}) log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
dy_i/dx_j dy_i/dx_j
= d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j = d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
= d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j = d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j
......
...@@ -31,7 +31,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -31,7 +31,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
loss function to measure the "number" of errors loss function to measure the "number" of errors
*/ */
/* /*
compute the loss compute the loss
>> gold - gold standard >> gold - gold standard
......
...@@ -88,7 +88,6 @@ dy/dx = 1 if x >= 0 ...@@ -88,7 +88,6 @@ dy/dx = 1 if x >= 0
>> y - output of the function >> y - output of the function
>> x - input of the function >> x - input of the function
>> size - size of output/input >> size - size of output/input
*/ */
__global__ __global__
void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size) void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
......
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
sigmoid function y = 1/(1+exp(-x)) sigmoid function y = 1/(1+exp(-x))
>> x - input tensor >> x - input tensor
......
...@@ -95,7 +95,6 @@ sigmoid: y = 1/(1+exp(-x)) ...@@ -95,7 +95,6 @@ sigmoid: y = 1/(1+exp(-x))
>> y - output of the function >> y - output of the function
>> x - input of the function >> x - input of the function
>> size - size of output/input >> size - size of output/input
*/ */
__global__ __global__
void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size) void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
...@@ -122,7 +121,6 @@ sigmoid: y = 1/(1+exp(-x)) ...@@ -122,7 +121,6 @@ sigmoid: y = 1/(1+exp(-x))
>> dedy - dE/dy >> dedy - dE/dy
>> dedx - dE/dx >> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy >> lossName - type of loss function, e.g., cross entropy
*/ */
void CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x, void CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * dedy, XTensor * dedx,
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */ /* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C" extern "C"
void CudaSotmax(XTensor * input, XTensor * output, int leadDim); void CudaSotmax(XTensor * input, XTensor * output, int leadDim);
......
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
#include "TConcatenate.h" #include "TConcatenate.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: concatenate a list of tensors along a given dimension.
* In this case, 2 * (2, 1) -> (2, 2), dim=1. /*
case 1: concatenate a list of tensors along a given dimension.
In this case, 2 * (2, 1) -> (2, 2), dim=1.
*/ */
bool TestConcatenate1() bool TestConcatenate1()
{ {
...@@ -60,12 +62,12 @@ bool TestConcatenate1() ...@@ -60,12 +62,12 @@ bool TestConcatenate1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][1] = { {2.0}, DTYPE sData2[2][1] = { {2.0F},
{3.0} }; {3.0F} };
DTYPE answer[2][2] = { {0.0, 2.0}, DTYPE answer[2][2] = { {0.0F, 2.0F},
{1.0, 3.0} }; {1.0F, 3.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -144,8 +146,9 @@ bool TestConcatenate1() ...@@ -144,8 +146,9 @@ bool TestConcatenate1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: concatenate a list of tensors along a given dimension. /*
* In this case, 2 * (2, 1) -> (4, 1), dim=0. case 2: concatenate a list of tensors along a given dimension.
In this case, 2 * (2, 1) -> (4, 1), dim=0.
*/ */
bool TestConcatenate2() bool TestConcatenate2()
{ {
...@@ -182,14 +185,14 @@ bool TestConcatenate2() ...@@ -182,14 +185,14 @@ bool TestConcatenate2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][1] = { {2.0}, DTYPE sData2[2][1] = { {2.0F},
{3.0} }; {3.0F} };
DTYPE answer[4][1] = { {0.0}, DTYPE answer[4][1] = { {0.0F},
{1.0}, {1.0F},
{2.0}, {2.0F},
{3.0} }; {3.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -268,8 +271,9 @@ bool TestConcatenate2() ...@@ -268,8 +271,9 @@ bool TestConcatenate2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: concatenate a list of tensors along a given dimension. /*
* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1. case 3: concatenate a list of tensors along a given dimension.
In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
*/ */
bool TestConcatenate3() bool TestConcatenate3()
{ {
...@@ -306,12 +310,12 @@ bool TestConcatenate3() ...@@ -306,12 +310,12 @@ bool TestConcatenate3()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][2] = { {2.0, 3.0}, DTYPE sData2[2][2] = { {2.0F, 3.0F},
{4.0, 5.0} }; {4.0F, 5.0F} };
DTYPE answer[2][3] = { {0.0, 2.0, 3.0}, DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
{1.0, 4.0, 5.0} }; {1.0F, 4.0F, 5.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -390,8 +394,9 @@ bool TestConcatenate3() ...@@ -390,8 +394,9 @@ bool TestConcatenate3()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 4: concatenate two tensors along a given dimension. /*
* In this case, (2, 1), (2, 2) -> (2, 3), dim=1. case 4: concatenate two tensors along a given dimension.
In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
*/ */
bool TestConcatenate4() bool TestConcatenate4()
{ {
...@@ -425,12 +430,12 @@ bool TestConcatenate4() ...@@ -425,12 +430,12 @@ bool TestConcatenate4()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][2] = { {2.0, 3.0}, DTYPE sData2[2][2] = { {2.0F, 3.0F},
{4.0, 5.0} }; {4.0F, 5.0F} };
DTYPE answer[2][3] = { {0.0, 2.0, 3.0}, DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
{1.0, 4.0, 5.0} }; {1.0F, 4.0F, 5.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -502,7 +507,6 @@ TODO!! ...@@ -502,7 +507,6 @@ TODO!!
*/ */
/* test for Concatenate Function */ /* test for Concatenate Function */
extern "C"
bool TestConcatenate() bool TestConcatenate()
{ {
XPRINT(0, stdout, "[TEST CONCATENATE] concatenate a list of tensors or two tensors along a given dimension \n"); XPRINT(0, stdout, "[TEST CONCATENATE] concatenate a list of tensors or two tensors along a given dimension \n");
......
...@@ -19,12 +19,14 @@ ...@@ -19,12 +19,14 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
*/ */
#include "TConcatenateSolely.h"
#include "../XList.h" #include "../XList.h"
#include "TConcatenateSolely.h"
namespace nts { // namespace nt(NiuTrans.Tensor) namespace nts { // namespace nt(NiuTrans.Tensor)
/* case 1: concatenate a list of tensors along a given dimension
* In this case, 2 * (2, 1) -> (2, 2), dim=1. /*
case 1: concatenate a list of tensors along a given dimension
In this case, 2 * (2, 1) -> (2, 2), dim=1.
*/ */
bool TestConcatenateSolely1() bool TestConcatenateSolely1()
{ {
...@@ -61,12 +63,12 @@ bool TestConcatenateSolely1() ...@@ -61,12 +63,12 @@ bool TestConcatenateSolely1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][1] = { {2.0}, DTYPE sData2[2][1] = { {2.0F},
{3.0} }; {3.0F} };
DTYPE answer[2][2] = { {0.0, 2.0}, DTYPE answer[2][2] = { {0.0F, 2.0F},
{1.0, 3.0} }; {1.0F, 3.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -145,8 +147,9 @@ bool TestConcatenateSolely1() ...@@ -145,8 +147,9 @@ bool TestConcatenateSolely1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: concatenate a list of tensors along a given dimension /*
* In this case, 2 * (2, 1) -> (4, 1), dim=0. case 2: concatenate a list of tensors along a given dimension
In this case, 2 * (2, 1) -> (4, 1), dim=0.
*/ */
bool TestConcatenateSolely2() bool TestConcatenateSolely2()
{ {
...@@ -183,14 +186,14 @@ bool TestConcatenateSolely2() ...@@ -183,14 +186,14 @@ bool TestConcatenateSolely2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][1] = { {2.0}, DTYPE sData2[2][1] = { {2.0F},
{3.0} }; {3.0F} };
DTYPE answer[4][1] = { {0.0}, DTYPE answer[4][1] = { {0.0F},
{1.0}, {1.0F},
{2.0}, {2.0F},
{3.0} }; {3.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -269,8 +272,9 @@ bool TestConcatenateSolely2() ...@@ -269,8 +272,9 @@ bool TestConcatenateSolely2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: concatenate a list of tensors along a given dimension /*
* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1. case 3: concatenate a list of tensors along a given dimension
In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
*/ */
bool TestConcatenateSolely3() bool TestConcatenateSolely3()
{ {
...@@ -307,12 +311,12 @@ bool TestConcatenateSolely3() ...@@ -307,12 +311,12 @@ bool TestConcatenateSolely3()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][2] = { {2.0, 3.0}, DTYPE sData2[2][2] = { {2.0F, 3.0F},
{4.0, 5.0} }; {4.0F, 5.0F} };
DTYPE answer[2][3] = { {0.0, 2.0, 3.0}, DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
{1.0, 4.0, 5.0} }; {1.0F, 4.0F, 5.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -397,7 +401,6 @@ TODO!! ...@@ -397,7 +401,6 @@ TODO!!
*/ */
/* test for ConcatenateSolely Function */ /* test for ConcatenateSolely Function */
extern "C"
bool TestConcatenateSolely() bool TestConcatenateSolely()
{ {
XPRINT(0, stdout, "[TEST CONCATENATESOLELY] concatenate a list of tensors along a given dimension \n"); XPRINT(0, stdout, "[TEST CONCATENATESOLELY] concatenate a list of tensors along a given dimension \n");
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TCopyIndexed.h" #include "TCopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 copy indexed sub-tensors
* In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2, /*
* srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1. case 1 copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2,
srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
*/ */
bool TestCopyIndexed1() bool TestCopyIndexed1()
{ {
...@@ -50,19 +52,19 @@ bool TestCopyIndexed1() ...@@ -50,19 +52,19 @@ bool TestCopyIndexed1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0, -1.0, 2.0}, DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0, 1.0, 3.0} }, {2.0F, 1.0F, 3.0F} },
{ {1.0, 2.0, 4.0}, { {1.0F, 2.0F, 4.0F},
{3.0, 1.0, 2.0}}, {3.0F, 1.0F, 2.0F}},
{ {-1.0, 3.0, 2.0}, { {-1.0F, 3.0F, 2.0F},
{1.0, -1.0, 0.0} } }; {1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][2] = { { {0.0, 2.0}, DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
{2.0, 3.0} }, {2.0F, 3.0F} },
{ {1.0, 4.0}, { {1.0F, 4.0F},
{3.0, 2.0}}, {3.0F, 2.0F}},
{ {-1.0, 2.0}, { {-1.0F, 2.0F},
{1.0, 0.0} } }; {1.0F, 0.0F} } };
int dim = 2; int dim = 2;
int indexSize = 2; int indexSize = 2;
int srcIndex[2] = {0, 2}; int srcIndex[2] = {0, 2};
...@@ -131,7 +133,6 @@ TODO!! ...@@ -131,7 +133,6 @@ TODO!!
*/ */
/* test for CopyIndexed Function */ /* test for CopyIndexed Function */
extern "C"
bool TestCopyIndexed() bool TestCopyIndexed()
{ {
XPRINT(0, stdout, "[TEST CopyIndexed] copy indexed sub-tensors \n"); XPRINT(0, stdout, "[TEST CopyIndexed] copy indexed sub-tensors \n");
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "TCopyValues.h" #include "TCopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: copy tensor s to tensor t */ /* case 1: copy tensor s to tensor t */
bool TestCopyValues1() bool TestCopyValues1()
{ {
...@@ -36,11 +37,11 @@ bool TestCopyValues1() ...@@ -36,11 +37,11 @@ bool TestCopyValues1()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE scaleFactor = 2.0; DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5; DTYPE shiftFactor = 0.5F;
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -105,7 +106,6 @@ TODO!! ...@@ -105,7 +106,6 @@ TODO!!
*/ */
/* test for CopyValues Function */ /* test for CopyValues Function */
extern "C"
bool TestCopyValues() bool TestCopyValues()
{ {
XPRINT(0, stdout, "[TEST CopyValues] copy tensor s to tensor t \n"); XPRINT(0, stdout, "[TEST CopyValues] copy tensor s to tensor t \n");
......
...@@ -22,10 +22,11 @@ ...@@ -22,10 +22,11 @@
#include "THardTanH.h" #include "THardTanH.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: hard tanh function */ /* case 1: hard tanh function */
bool TestHardTanH1() bool TestHardTanH1()
{ {
/* a x tensor of size 2 * 3 */ /* a x tensor of size (2, 3) */
int xOrder = 2; int xOrder = 2;
int * xDimSize = new int[xOrder]; int * xDimSize = new int[xOrder];
xDimSize[0] = 2; xDimSize[0] = 2;
...@@ -35,7 +36,7 @@ bool TestHardTanH1() ...@@ -35,7 +36,7 @@ bool TestHardTanH1()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */ /* a y tensor of size (2, 3) */
int yOrder = 2; int yOrder = 2;
int * yDimSize = new int[yOrder]; int * yDimSize = new int[yOrder];
yDimSize[0] = 2; yDimSize[0] = 2;
...@@ -45,10 +46,10 @@ bool TestHardTanH1() ...@@ -45,10 +46,10 @@ bool TestHardTanH1()
for (int i = 0; i < yOrder; i++) for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i]; yUnitNum *= yDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0}, DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5, -4.5, 1.0} }; {3.5F, -4.5F, 1.0F} };
DTYPE answer[2][3] = { {0.5, -1.0, 1.0}, DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0, -1.0, 1.0} }; {1.0F, -1.0F, 1.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -86,25 +87,32 @@ bool TestHardTanH1() ...@@ -86,25 +87,32 @@ bool TestHardTanH1()
gpuTest = yGPU->CheckData(answer, yUnitNum, 1e-4F); gpuTest = yGPU->CheckData(answer, yUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete x, y, xGPU, yGPU; delete x;
delete[] xDimSize, yDimSize; delete y;
delete xGPU;
delete yGPU;
delete[] xDimSize;
delete[] yDimSize;
return cpuTest && gpuTest; return cpuTest && gpuTest;
#else #else
/* destroy variables */ /* destroy variables */
delete x, y; delete x;
delete[] xDimSize, yDimSize; delete y;
delete[] xDimSize;
delete[] yDimSize;
return cpuTest; return cpuTest;
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: backward computation /*
* In this case, lossName=CROSSENTROPY. case 2: backward computation
In this case, lossName=CROSSENTROPY.
*/ */
bool TestHardTanH2() bool TestHardTanH2()
{ {
/* a x tensor of size 2 * 3 */ /* a x tensor of size (2, 3) */
int xOrder = 2; int xOrder = 2;
int * xDimSize = new int[xOrder]; int * xDimSize = new int[xOrder];
xDimSize[0] = 2; xDimSize[0] = 2;
...@@ -114,7 +122,7 @@ bool TestHardTanH2() ...@@ -114,7 +122,7 @@ bool TestHardTanH2()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */ /* a y tensor of size (2, 3) */
int yOrder = 2; int yOrder = 2;
int * yDimSize = new int[yOrder]; int * yDimSize = new int[yOrder];
yDimSize[0] = 2; yDimSize[0] = 2;
...@@ -124,7 +132,7 @@ bool TestHardTanH2() ...@@ -124,7 +132,7 @@ bool TestHardTanH2()
for (int i = 0; i < yOrder; i++) for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i]; yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */ /* a gold tensor of size (2, 3) */
int goldOrder = 2; int goldOrder = 2;
int * goldDimSize = new int[goldOrder]; int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2; goldDimSize[0] = 2;
...@@ -134,7 +142,7 @@ bool TestHardTanH2() ...@@ -134,7 +142,7 @@ bool TestHardTanH2()
for (int i = 0; i < goldOrder; i++) for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i]; goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */ /* a dedy tensor of size (2, 3) */
int dedyOrder = 2; int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder]; int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2; dedyDimSize[0] = 2;
...@@ -144,7 +152,7 @@ bool TestHardTanH2() ...@@ -144,7 +152,7 @@ bool TestHardTanH2()
for (int i = 0; i < dedyOrder; i++) for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i]; dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */ /* a dedx tensor of size (2, 3) */
int dedxOrder = 2; int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder]; int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2; dedxDimSize[0] = 2;
...@@ -154,16 +162,16 @@ bool TestHardTanH2() ...@@ -154,16 +162,16 @@ bool TestHardTanH2()
for (int i = 0; i < dedxOrder; i++) for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i]; dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0}, DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5, -4.5, 1.0} }; {3.5F, -4.5F, 1.0F} };
DTYPE yData[2][3] = { {0.5, -1.0, 1.0}, DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0, -1.0, 1.0} }; {1.0F, -1.0F, 1.0F} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0}, DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0, 1.0, 1.0} }; {1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-2.0, 1.0, -1.0}, DTYPE dedyData[2][3] = { {-2.0F, 1.0F, -1.0F},
{-1.0, 1.0, -1.0} }; {-1.0F, 1.0F, -1.0F} };
DTYPE answer[2][3] = { {-2.0, 1.0, 0.0}, DTYPE answer[2][3] = { {-2.0F, 1.0F, 0.0F},
{0.0, 0.0, -1.0} }; {0.0F, 0.0F, -1.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -226,12 +234,13 @@ bool TestHardTanH2() ...@@ -226,12 +234,13 @@ bool TestHardTanH2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: backward computation /*
* In this case, lossName=SQUAREDERROR. case 3: backward computation
In this case, lossName=SQUAREDERROR.
*/ */
bool TestHardTanH3() bool TestHardTanH3()
{ {
/* a x tensor of size 2 * 3 */ /* a x tensor of size (2, 3) */
int xOrder = 2; int xOrder = 2;
int * xDimSize = new int[xOrder]; int * xDimSize = new int[xOrder];
xDimSize[0] = 2; xDimSize[0] = 2;
...@@ -241,7 +250,7 @@ bool TestHardTanH3() ...@@ -241,7 +250,7 @@ bool TestHardTanH3()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */ /* a y tensor of size (2, 3) */
int yOrder = 2; int yOrder = 2;
int * yDimSize = new int[yOrder]; int * yDimSize = new int[yOrder];
yDimSize[0] = 2; yDimSize[0] = 2;
...@@ -251,7 +260,7 @@ bool TestHardTanH3() ...@@ -251,7 +260,7 @@ bool TestHardTanH3()
for (int i = 0; i < yOrder; i++) for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i]; yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */ /* a gold tensor of size (2, 3) */
int goldOrder = 2; int goldOrder = 2;
int * goldDimSize = new int[goldOrder]; int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2; goldDimSize[0] = 2;
...@@ -261,7 +270,7 @@ bool TestHardTanH3() ...@@ -261,7 +270,7 @@ bool TestHardTanH3()
for (int i = 0; i < goldOrder; i++) for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i]; goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */ /* a dedy tensor of size (2, 3) */
int dedyOrder = 2; int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder]; int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2; dedyDimSize[0] = 2;
...@@ -271,7 +280,7 @@ bool TestHardTanH3() ...@@ -271,7 +280,7 @@ bool TestHardTanH3()
for (int i = 0; i < dedyOrder; i++) for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i]; dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */ /* a dedx tensor of size (2, 3) */
int dedxOrder = 2; int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder]; int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2; dedxDimSize[0] = 2;
...@@ -281,16 +290,16 @@ bool TestHardTanH3() ...@@ -281,16 +290,16 @@ bool TestHardTanH3()
for (int i = 0; i < dedxOrder; i++) for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i]; dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0}, DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5, -4.5, 1.0} }; {3.5F, -4.5F, 1.0F} };
DTYPE yData[2][3] = { {0.5, -1.0, 1.0}, DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0, -1.0, 1.0} }; {1.0F, -1.0F, 1.0F} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0}, DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0, 1.0, 1.0} }; {1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-0.5, -2.0, 0.0 }, DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F },
{0.0, -2.0, 0.0 } }; {0.0F, -2.0F, 0.0F } };
DTYPE answer[2][3] = { {-0.5, -2.0, 0.0}, DTYPE answer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0, 0.0, 0.0} }; {0.0F, 0.0F, 0.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -353,12 +362,13 @@ bool TestHardTanH3() ...@@ -353,12 +362,13 @@ bool TestHardTanH3()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 4: backward computation /*
* In this case, lossName=ONEHOTERROR. case 4: backward computation
In this case, lossName=ONEHOTERROR.
*/ */
bool TestHardTanH4() bool TestHardTanH4()
{ {
/* a x tensor of size 2 * 3 */ /* a x tensor of size (2, 3) */
int xOrder = 2; int xOrder = 2;
int * xDimSize = new int[xOrder]; int * xDimSize = new int[xOrder];
xDimSize[0] = 2; xDimSize[0] = 2;
...@@ -368,7 +378,7 @@ bool TestHardTanH4() ...@@ -368,7 +378,7 @@ bool TestHardTanH4()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
/* a y tensor of size 2 * 3 */ /* a y tensor of size (2, 3) */
int yOrder = 2; int yOrder = 2;
int * yDimSize = new int[yOrder]; int * yDimSize = new int[yOrder];
yDimSize[0] = 2; yDimSize[0] = 2;
...@@ -378,7 +388,7 @@ bool TestHardTanH4() ...@@ -378,7 +388,7 @@ bool TestHardTanH4()
for (int i = 0; i < yOrder; i++) for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i]; yUnitNum *= yDimSize[i];
/* a gold tensor of size 2 * 3 */ /* a gold tensor of size (2, 3) */
int goldOrder = 2; int goldOrder = 2;
int * goldDimSize = new int[goldOrder]; int * goldDimSize = new int[goldOrder];
goldDimSize[0] = 2; goldDimSize[0] = 2;
...@@ -388,7 +398,7 @@ bool TestHardTanH4() ...@@ -388,7 +398,7 @@ bool TestHardTanH4()
for (int i = 0; i < goldOrder; i++) for (int i = 0; i < goldOrder; i++)
goldUnitNum *= goldDimSize[i]; goldUnitNum *= goldDimSize[i];
/* a dedy tensor of size 2 * 3 */ /* a dedy tensor of size (2, 3) */
int dedyOrder = 2; int dedyOrder = 2;
int * dedyDimSize = new int[dedyOrder]; int * dedyDimSize = new int[dedyOrder];
dedyDimSize[0] = 2; dedyDimSize[0] = 2;
...@@ -398,7 +408,7 @@ bool TestHardTanH4() ...@@ -398,7 +408,7 @@ bool TestHardTanH4()
for (int i = 0; i < dedyOrder; i++) for (int i = 0; i < dedyOrder; i++)
dedyUnitNum *= dedyDimSize[i]; dedyUnitNum *= dedyDimSize[i];
/* a dedx tensor of size 2 * 3 */ /* a dedx tensor of size (2, 3) */
int dedxOrder = 2; int dedxOrder = 2;
int * dedxDimSize = new int[dedxOrder]; int * dedxDimSize = new int[dedxOrder];
dedxDimSize[0] = 2; dedxDimSize[0] = 2;
...@@ -408,16 +418,16 @@ bool TestHardTanH4() ...@@ -408,16 +418,16 @@ bool TestHardTanH4()
for (int i = 0; i < dedxOrder; i++) for (int i = 0; i < dedxOrder; i++)
dedxUnitNum *= dedxDimSize[i]; dedxUnitNum *= dedxDimSize[i];
DTYPE xData[2][3] = { {0.5, -1.0, 2.0}, DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5, -4.5, 1.0} }; {3.5F, -4.5F, 1.0F} };
DTYPE yData[2][3] = { {0.5, -1.0, 1.0}, DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0, -1.0, 1.0} }; {1.0F, -1.0F, 1.0F} };
DTYPE goldData[2][3] = { {1.0, 0.0, 1.0}, DTYPE goldData[2][3] = { {1.0F, 0.0F, 1.0F},
{0.0, 1.0, 1.0} }; {0.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-0.5, 0.0, 0.0}, DTYPE dedyData[2][3] = { {-0.5F, 0.0F, 0.0F},
{0.0, -2.0, 0.0} }; {0.0F, -2.0F, 0.0F} };
DTYPE answer[2][3] = { {-0.5, 0.0, 0.0}, DTYPE answer[2][3] = { {-0.5F, 0.0F, 0.0F},
{0.0, 0.0, 0.0} }; {0.0F, 0.0F, 0.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -486,10 +496,9 @@ TODO!! ...@@ -486,10 +496,9 @@ TODO!!
*/ */
/* test for HardTanH Function */ /* test for HardTanH Function */
extern "C"
bool TestHardTanH() bool TestHardTanH()
{ {
XPRINT(0, stdout, "[TEST HARDTANH] -------------\n"); XPRINT(0, stdout, "[TEST HARDTANH] test hardtanh and its backward computation \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -23,8 +23,10 @@ ...@@ -23,8 +23,10 @@
#include "TIdentity.h" #include "TIdentity.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test Identity function.
* Identity function: y = x /*
case 1: test Identity function.
Identity function: y = x
*/ */
bool TestIdentity1() bool TestIdentity1()
{ {
...@@ -38,10 +40,10 @@ bool TestIdentity1() ...@@ -38,10 +40,10 @@ bool TestIdentity1()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE xData[2][3] = { {0.0, 1.0, 2.0}, DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5, 0.7, 1.4} }; {0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {0.0, 1.0, 2.0}, DTYPE answer[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5, 0.7, 1.4} }; {0.5F, 0.7F, 1.4F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -93,8 +95,9 @@ bool TestIdentity1() ...@@ -93,8 +95,9 @@ bool TestIdentity1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test IdentityBackward function. /*
* IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy case 2: test IdentityBackward function.
IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
*/ */
bool TestIdentity2() bool TestIdentity2()
{ {
...@@ -107,9 +110,9 @@ bool TestIdentity2() ...@@ -107,9 +110,9 @@ bool TestIdentity2()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE xData[1][3] = { {0.0, 1.0, 2.0} }; DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.0, 0.0, 1.0} }; DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
DTYPE dedxAnswer[3] = {0.090031, 0.244728, -0.334759}; DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -135,7 +138,7 @@ bool TestIdentity2() ...@@ -135,7 +138,7 @@ bool TestIdentity2()
IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY); IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
/* check result */ /* check result */
cpuTest = dedx->CheckData(dedxAnswer, sUnitNum); cpuTest = dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -162,7 +165,7 @@ bool TestIdentity2() ...@@ -162,7 +165,7 @@ bool TestIdentity2()
IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY); IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
/* check result */ /* check result */
gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum); gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete x; delete x;
...@@ -197,7 +200,6 @@ bool TestIdentity2() ...@@ -197,7 +200,6 @@ bool TestIdentity2()
*/ */
/* test for Identity Function */ /* test for Identity Function */
extern "C"
bool TestIdentity() bool TestIdentity()
{ {
XPRINT(0, stdout, "[TEST Identity] identity function and its backward computation \n"); XPRINT(0, stdout, "[TEST Identity] identity function and its backward computation \n");
...@@ -213,15 +215,15 @@ bool TestIdentity() ...@@ -213,15 +215,15 @@ bool TestIdentity()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
///* case 2 test */ /* case 2 test */
//caseFlag = TestIdentity2(); caseFlag = TestIdentity2();
//if (!caseFlag) { if (!caseFlag) {
// returnFlag = false; returnFlag = false;
// XPRINT(0, stdout, ">> case 2 failed!\n"); XPRINT(0, stdout, ">> case 2 failed!\n");
//} }
//else else
// XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
......
...@@ -23,8 +23,10 @@ ...@@ -23,8 +23,10 @@
#include "TLogSoftmax.h" #include "TLogSoftmax.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test LogSoftmax function.
* LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i}) /*
case 1: test LogSoftmax function.
LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
*/ */
bool TestLogSoftmax1() bool TestLogSoftmax1()
{ {
...@@ -38,10 +40,10 @@ bool TestLogSoftmax1() ...@@ -38,10 +40,10 @@ bool TestLogSoftmax1()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE xData[2][3] = { {0.0, 1.0, 2.0}, DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.5, 0.7, 1.4} }; {0.5F, 0.7F, 1.4F} };
DTYPE answer[2][3] = { {-2.4076, -1.4076, -0.4076}, DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F},
{-1.5435, -1.3435, -0.6435} }; {-1.5435F, -1.3435F, -0.6435F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -58,7 +60,7 @@ bool TestLogSoftmax1() ...@@ -58,7 +60,7 @@ bool TestLogSoftmax1()
LogSoftmax(x, y, 1); LogSoftmax(x, y, 1);
/* check result */ /* check result */
cpuTest = y->CheckData(answer, sUnitNum); cpuTest = y->CheckData(answer, sUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -76,7 +78,7 @@ bool TestLogSoftmax1() ...@@ -76,7 +78,7 @@ bool TestLogSoftmax1()
LogSoftmax(xGPU, yGPU, 1); LogSoftmax(xGPU, yGPU, 1);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(answer, sUnitNum); gpuTest = yGPU->CheckData(answer, sUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete x; delete x;
...@@ -97,9 +99,10 @@ bool TestLogSoftmax1() ...@@ -97,9 +99,10 @@ bool TestLogSoftmax1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test LogSoftmaxBackward function. /*
* dE/dx = dE/dy * dy/dx case 2: test LogSoftmaxBackward function.
* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k}) dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
*/ */
bool TestLogSoftmax2() bool TestLogSoftmax2()
{ {
...@@ -112,10 +115,10 @@ bool TestLogSoftmax2() ...@@ -112,10 +115,10 @@ bool TestLogSoftmax2()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE xData[3] = {0.0, 1.0, 2.0}; DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.5, 0.8, 1.5}; DTYPE gData[3] = {0.5F, 0.8F, 1.5F};
DTYPE yAnswer[3] = {-2.4076, -1.4076, -0.4076}; DTYPE yAnswer[3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[3] = {-0.409969, -0.555272, -0.834759}; DTYPE dedxAnswer[3] = {-0.409969F, -0.555272F, -0.834759F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -141,7 +144,7 @@ bool TestLogSoftmax2() ...@@ -141,7 +144,7 @@ bool TestLogSoftmax2()
LogSoftmaxBackward(g, y, x, dedy, dedx, 0, CROSSENTROPY); LogSoftmaxBackward(g, y, x, dedy, dedx, 0, CROSSENTROPY);
/* check result */ /* check result */
cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum); cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) && dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -168,7 +171,7 @@ bool TestLogSoftmax2() ...@@ -168,7 +171,7 @@ bool TestLogSoftmax2()
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 0, CROSSENTROPY); LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 0, CROSSENTROPY);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum); gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete x; delete x;
...@@ -197,9 +200,10 @@ bool TestLogSoftmax2() ...@@ -197,9 +200,10 @@ bool TestLogSoftmax2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: test LogSoftmaxBackward function. /*
* dE/dx = dE/dy * dy/dx case 3: test LogSoftmaxBackward function.
* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k}) dE/dx = dE/dy * dy/dx
log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
*/ */
bool TestLogSoftmax3() bool TestLogSoftmax3()
{ {
...@@ -213,10 +217,10 @@ bool TestLogSoftmax3() ...@@ -213,10 +217,10 @@ bool TestLogSoftmax3()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE xData[1][3] = { {0.0, 1.0, 2.0} }; DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
DTYPE gData[1][3] = { {0.5, 0.8, 1.5} }; DTYPE gData[1][3] = { {0.5F, 0.8F, 1.5F} };
DTYPE yAnswer[1][3] = {-2.4076, -1.4076, -0.4076}; DTYPE yAnswer[1][3] = {-2.4076F, -1.4076F, -0.4076F};
DTYPE dedxAnswer[1][3] = {-0.409969, -0.555272, -0.834759}; DTYPE dedxAnswer[1][3] = {-0.409969F, -0.555272F, -0.834759F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -242,7 +246,7 @@ bool TestLogSoftmax3() ...@@ -242,7 +246,7 @@ bool TestLogSoftmax3()
LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY); LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
/* check result */ /* check result */
cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum); cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) && dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -269,7 +273,7 @@ bool TestLogSoftmax3() ...@@ -269,7 +273,7 @@ bool TestLogSoftmax3()
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY); LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
/* check result */ /* check result */
gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum); gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
/* destroy variables */ /* destroy variables */
delete x; delete x;
...@@ -305,7 +309,6 @@ bool TestLogSoftmax3() ...@@ -305,7 +309,6 @@ bool TestLogSoftmax3()
*/ */
/* test for LogSoftmax Function */ /* test for LogSoftmax Function */
extern "C"
bool TestLogSoftmax() bool TestLogSoftmax()
{ {
XPRINT(0, stdout, "[TEST LogSoftmax] test log softmax function and its backward computation \n"); XPRINT(0, stdout, "[TEST LogSoftmax] test log softmax function and its backward computation \n");
...@@ -321,15 +324,15 @@ bool TestLogSoftmax() ...@@ -321,15 +324,15 @@ bool TestLogSoftmax()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
///* case 2 test */ /* case 2 test */
//caseFlag = TestLogSoftmax2(); caseFlag = TestLogSoftmax2();
//if (!caseFlag) { if (!caseFlag) {
// returnFlag = false; returnFlag = false;
// XPRINT(0, stdout, ">> case 2 failed!\n"); XPRINT(0, stdout, ">> case 2 failed!\n");
//} }
//else else
// XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */ /* case 3 test */
caseFlag = TestLogSoftmax3(); caseFlag = TestLogSoftmax3();
......
...@@ -23,10 +23,12 @@ ...@@ -23,10 +23,12 @@
#include "../function/Loss.h" #include "../function/Loss.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test LossCompute function
* In this case, Loss function name = SQUAREDERROR. /*
* loss = sum_{i} 0.5*(t_i - y_i)^2, case 1: test LossCompute function
* where t_i is the gold standard and y_i is the model output In this case, Loss function name = SQUAREDERROR.
loss = sum_{i} 0.5*(t_i - y_i)^2,
where t_i is the gold standard and y_i is the model output
*/ */
bool TestLoss1() bool TestLoss1()
{ {
...@@ -99,10 +101,11 @@ bool TestLoss1() ...@@ -99,10 +101,11 @@ bool TestLoss1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test LossCompute function /*
* In this case, Loss function name = CROSSENTROPY. case 2: test LossCompute function
* loss = sum_{i} (-t_i * log(y_i)) In this case, Loss function name = CROSSENTROPY.
* where t_i is the gold standard and y_i is the model output loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output
*/ */
bool TestLoss2() bool TestLoss2()
{ {
...@@ -175,10 +178,11 @@ bool TestLoss2() ...@@ -175,10 +178,11 @@ bool TestLoss2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: test LossCompute function /*
* In this case, Loss function name = ONEHOTERROR. case 3: test LossCompute function
* loss = sum_{i} e_i In this case, Loss function name = ONEHOTERROR.
* where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise loss = sum_{i} e_i
where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
*/ */
bool TestLoss3() bool TestLoss3()
{ {
...@@ -191,16 +195,16 @@ bool TestLoss3() ...@@ -191,16 +195,16 @@ bool TestLoss3()
int unitNum = 1; int unitNum = 1;
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
DTYPE outputData[5][1] = { {0.5}, DTYPE outputData[5][1] = { {0.5F},
{0.5}, {0.5F},
{0.5}, {0.5F},
{0.5}, {0.5F},
{0.5} }; {0.5F} };
DTYPE goldData[5][1] = { {1.0}, DTYPE goldData[5][1] = { {1.0F},
{1.0}, {1.0F},
{0.0}, {0.0F},
{0.0}, {0.0F},
{0.0} }; {0.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -263,7 +267,6 @@ TODO!! ...@@ -263,7 +267,6 @@ TODO!!
*/ */
/* test for Loss Function */ /* test for Loss Function */
extern "C"
bool TestLoss() bool TestLoss()
{ {
XPRINT(0, stdout, "[TEST Loss] compute the loss \n"); XPRINT(0, stdout, "[TEST Loss] compute the loss \n");
......
...@@ -22,9 +22,10 @@ ...@@ -22,9 +22,10 @@
#include "TMatrixMULBatchedCPU.h" #include "TMatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication in batch mode (CPU code).
* In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), /*
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. case 1: matrix multiplication in batch mode (CPU code).
In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMulBatchedCPU1() bool TestMatrixMulBatchedCPU1()
{ {
...@@ -63,20 +64,20 @@ bool TestMatrixMulBatchedCPU1() ...@@ -63,20 +64,20 @@ bool TestMatrixMulBatchedCPU1()
for (int i = 0; i < cOrder; i++) for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i]; cUnitNum *= cDimSize[i];
DTYPE aData1[2][3] = { {1.0, 2.0, 3.0}, DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0, 5.0, 6.0} }; {-4.0F, 5.0F, 6.0F} };
DTYPE aData2[2][3] = { {1.0, -2.0, -3.0}, DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
{-4.0, 3.0, 2.0} }; {-4.0F, 3.0F, 2.0F} };
DTYPE bData1[3][2] = { {0.0, -1.0}, DTYPE bData1[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE bData2[3][2] = { {0.0, 1.0}, DTYPE bData2[3][2] = { {0.0F, 1.0F},
{3.0, 2.0}, {3.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer1[2][2] = { {8.0, 6.0}, DTYPE answer1[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
DTYPE answer2[2][2] = { {-12.0, -6.0}, DTYPE answer2[2][2] = { {-12.0F, -6.0F},
{13.0, 4.0} }; {13.0F, 4.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TMatrixMul.h" #include "TMatrixMul.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication.
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), /*
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. case 1: matrix multiplication.
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul1() bool TestMatrixMul1()
{ {
...@@ -58,13 +60,13 @@ bool TestMatrixMul1() ...@@ -58,13 +60,13 @@ bool TestMatrixMul1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0}, DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0, 5.0, 6.0} }; {-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -130,9 +132,10 @@ bool TestMatrixMul1() ...@@ -130,9 +132,10 @@ bool TestMatrixMul1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: matrix multiplication. /*
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), case 2: matrix multiplication.
* transposedA=X_TRANS, transposedB=X_NOTRANS. In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
transposedA=X_TRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul2() bool TestMatrixMul2()
{ {
...@@ -166,14 +169,14 @@ bool TestMatrixMul2() ...@@ -166,14 +169,14 @@ bool TestMatrixMul2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0}, DTYPE sData1[3][2] = { {1.0F, -4.0F},
{2.0, 5.0}, {2.0F, 5.0F},
{3.0, 6.0} }; {3.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -239,9 +242,10 @@ bool TestMatrixMul2() ...@@ -239,9 +242,10 @@ bool TestMatrixMul2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: matrix multiplication. /*
* In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), case 3: matrix multiplication.
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul3() bool TestMatrixMul3()
{ {
...@@ -279,30 +283,30 @@ bool TestMatrixMul3() ...@@ -279,30 +283,30 @@ bool TestMatrixMul3()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0}, DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0, 1.0, 3.0} }, {2.0F, 1.0F, 3.0F} },
{ {1.0, 2.0, 4.0}, { {1.0F, 2.0F, 4.0F},
{3.0, 1.0, 2.0}}, {3.0F, 1.0F, 2.0F}},
{ {-1.0, 3.0, 2.0}, { {-1.0F, 3.0F, 2.0F},
{1.0, -1.0, 0.0} } }; {1.0F, -1.0F, 0.0F} } };
DTYPE sData2[2][3][2] = { { {1.0, 2.0}, DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
{-4.0, 3.0}, {-4.0F, 3.0F},
{2.0, 6.0} }, {2.0F, 6.0F} },
{ {1.0, 2.0}, { {1.0F, 2.0F},
{3.0, 4.0}, {3.0F, 4.0F},
{5.0, 6.0} } }; {5.0F, 6.0F} } };
DTYPE answer[3][2][2][2] = { { { {8.0, 9.0}, DTYPE answer[3][2][2][2] = { { { {8.0F, 9.0F},
{4.0, 25.0} }, {4.0F, 25.0F} },
{ {7.0, 8.0}, { {7.0F, 8.0F},
{20.0, 26.0} } }, {20.0F, 26.0F} } },
{ { {1.0, 32.0}, { { {1.0F, 32.0F},
{3.0, 21.0} }, {3.0F, 21.0F} },
{ {27.0, 34.0}, { {27.0F, 34.0F},
{16.0, 22.0} } }, {16.0F, 22.0F} } },
{ { {-9.0, 19.0}, { { {-9.0F, 19.0F},
{5.0, -1.0} }, {5.0F, -1.0F} },
{ {18.0, 22.0}, { {18.0F, 22.0F},
{-2.0, -2.0} } } }; {-2.0F, -2.0F} } } };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -368,9 +372,10 @@ bool TestMatrixMul3() ...@@ -368,9 +372,10 @@ bool TestMatrixMul3()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 4: matrix multiplication. /*
* In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2), case 4: matrix multiplication.
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul4() bool TestMatrixMul4()
{ {
...@@ -406,21 +411,21 @@ bool TestMatrixMul4() ...@@ -406,21 +411,21 @@ bool TestMatrixMul4()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0}, DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0, 1.0, 3.0} }, {2.0F, 1.0F, 3.0F} },
{ {1.0, 2.0, 4.0}, { {1.0F, 2.0F, 4.0F},
{3.0, 1.0, 2.0}}, {3.0F, 1.0F, 2.0F}},
{ {-1.0, 3.0, 2.0}, { {-1.0F, 3.0F, 2.0F},
{1.0, -1.0, 0.0} } }; {1.0F, -1.0F, 0.0F} } };
DTYPE sData2[3][2] = { {1.0, 2.0}, DTYPE sData2[3][2] = { {1.0F, 2.0F},
{3.0, 4.0}, {3.0F, 4.0F},
{5.0, 6.0} }; {5.0F, 6.0F} };
DTYPE answer[3][2][2] = { { {7.0, 8.0}, DTYPE answer[3][2][2] = { { {7.0F, 8.0F},
{20.0, 26.0} }, {20.0F, 26.0F} },
{ {27.0, 34.0}, { {27.0F, 34.0F},
{16.0, 22.0} }, {16.0F, 22.0F} },
{ {18.0, 22.0}, { {18.0F, 22.0F},
{-2.0, -2.0} } }; {-2.0F, -2.0F} } };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -493,7 +498,6 @@ bool TestMatrixMul4() ...@@ -493,7 +498,6 @@ bool TestMatrixMul4()
*/ */
/* test for MatrixMul Function */ /* test for MatrixMul Function */
extern "C"
bool TestMatrixMul() bool TestMatrixMul()
{ {
XPRINT(0, stdout, "[TEST MATRIXMUL] matrix multiplication \n"); XPRINT(0, stdout, "[TEST MATRIXMUL] matrix multiplication \n");
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TMatrixMul2D.h" #include "TMatrixMul2D.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication (for 2d tensors).
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), /*
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. case 1: matrix multiplication (for 2d tensors).
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul2D1() bool TestMatrixMul2D1()
{ {
...@@ -58,13 +60,13 @@ bool TestMatrixMul2D1() ...@@ -58,13 +60,13 @@ bool TestMatrixMul2D1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0}, DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0, 5.0, 6.0} }; {-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -130,9 +132,10 @@ bool TestMatrixMul2D1() ...@@ -130,9 +132,10 @@ bool TestMatrixMul2D1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: matrix multiplication (for 2d tensors). /*
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), case 2: matrix multiplication (for 2d tensors).
* transposedA=X_TRANS, transposedB=X_NOTRANS. In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
transposedA=X_TRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul2D2() bool TestMatrixMul2D2()
{ {
...@@ -166,14 +169,14 @@ bool TestMatrixMul2D2() ...@@ -166,14 +169,14 @@ bool TestMatrixMul2D2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0}, DTYPE sData1[3][2] = { {1.0F, -4.0F},
{2.0, 5.0}, {2.0F, 5.0F},
{3.0, 6.0} }; {3.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TMatrixMul2DParallel.h" #include "TMatrixMul2DParallel.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication (for 2d tensors) with multi-threading.
* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), /*
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. case 1: matrix multiplication (for 2d tensors) with multi-threading.
In this case, a=(2, 3), b=(3, 2) -> c=(2, 2),
transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul2DParallel1() bool TestMatrixMul2DParallel1()
{ {
...@@ -58,13 +60,13 @@ bool TestMatrixMul2DParallel1() ...@@ -58,13 +60,13 @@ bool TestMatrixMul2DParallel1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0}, DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0, 5.0, 6.0} }; {-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -96,9 +98,10 @@ bool TestMatrixMul2DParallel1() ...@@ -96,9 +98,10 @@ bool TestMatrixMul2DParallel1()
return cpuTest; return cpuTest;
} }
/* case 2: matrix multiplication (for 2d tensors) with multi-threading. /*
* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), case 2: matrix multiplication (for 2d tensors) with multi-threading.
* transposedA=X_TRANS, transposedB=X_NOTRANS. In this case, a=(3, 2), b=(3, 2) -> c=(2, 2),
transposedA=X_TRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMul2DParallel2() bool TestMatrixMul2DParallel2()
{ {
...@@ -132,14 +135,14 @@ bool TestMatrixMul2DParallel2() ...@@ -132,14 +135,14 @@ bool TestMatrixMul2DParallel2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[3][2] = { {1.0, -4.0}, DTYPE sData1[3][2] = { {1.0F, -4.0F},
{2.0, 5.0}, {2.0F, 5.0F},
{3.0, 6.0} }; {3.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -177,7 +180,6 @@ bool TestMatrixMul2DParallel2() ...@@ -177,7 +180,6 @@ bool TestMatrixMul2DParallel2()
*/ */
/* test for MatrixMul2DParallel Function */ /* test for MatrixMul2DParallel Function */
extern "C"
bool TestMatrixMul2DParallel() bool TestMatrixMul2DParallel()
{ {
XPRINT(0, stdout, "[TEST MatrixMul2DParallel] matrix multiplication (for 2d tensors) with multi-threading \n"); XPRINT(0, stdout, "[TEST MatrixMul2DParallel] matrix multiplication (for 2d tensors) with multi-threading \n");
......
...@@ -22,9 +22,10 @@ ...@@ -22,9 +22,10 @@
#include "TMatrixMULBatched.h" #include "TMatrixMULBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: matrix multiplication of the two tensors.
* In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS, /*
transposedB=X_NOTRANS. case 1: matrix multiplication of the two tensors.
In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMulBatched1() bool TestMatrixMulBatched1()
{ {
...@@ -58,13 +59,13 @@ bool TestMatrixMulBatched1() ...@@ -58,13 +59,13 @@ bool TestMatrixMulBatched1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][3] = { {1.0, 2.0, 3.0}, DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
{-4.0, 5.0, 6.0} }; {-4.0F, 5.0F, 6.0F} };
DTYPE sData2[3][2] = { {0.0, -1.0}, DTYPE sData2[3][2] = { {0.0F, -1.0F},
{1.0, 2.0}, {1.0F, 2.0F},
{2.0, 1.0} }; {2.0F, 1.0F} };
DTYPE answer[2][2] = { {8.0, 6.0}, DTYPE answer[2][2] = { {8.0F, 6.0F},
{17.0, 20.0} }; {17.0F, 20.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -130,9 +131,9 @@ bool TestMatrixMulBatched1() ...@@ -130,9 +131,9 @@ bool TestMatrixMulBatched1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: matrix multiplication of the two tensors. /*
* In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2), case 2: matrix multiplication of the two tensors.
* transposedA=X_NOTRANS, transposedB=X_NOTRANS. In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
*/ */
bool TestMatrixMulBatched2() bool TestMatrixMulBatched2()
{ {
...@@ -169,20 +170,20 @@ bool TestMatrixMulBatched2() ...@@ -169,20 +170,20 @@ bool TestMatrixMulBatched2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][2][3] = { { {0.0, -1.0, 2.0}, DTYPE sData1[2][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0, 1.0, 3.0} }, {2.0F, 1.0F, 3.0F} },
{ {1.0, 2.0, 4.0}, { {1.0F, 2.0F, 4.0F},
{3.0, 1.0, 2.0} } }; {3.0F, 1.0F, 2.0F} } };
DTYPE sData2[2][3][2] = { { {1.0, 2.0}, DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
{-4.0, 3.0}, {-4.0F, 3.0F},
{2.0, 6.0} }, {2.0F, 6.0F} },
{ {1.0, 2.0}, { {1.0F, 2.0F},
{3.0, 4.0}, {3.0F, 4.0F},
{5.0, 6.0} } }; {5.0F, 6.0F} } };
DTYPE answer[2][2][2] = { { {8.0, 9.0}, DTYPE answer[2][2][2] = { { {8.0F, 9.0F},
{4.0, 25.0} }, {4.0F, 25.0F} },
{ {27.0, 34.0}, { {27.0F, 34.0F},
{16.0, 22.0} } }; {16.0F, 22.0F} } };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -254,7 +255,6 @@ bool TestMatrixMulBatched2() ...@@ -254,7 +255,6 @@ bool TestMatrixMulBatched2()
*/ */
/* test for TestMatrixMulBatched Function */ /* test for TestMatrixMulBatched Function */
extern "C"
bool TestMatrixMulBatched() bool TestMatrixMulBatched()
{ {
XPRINT(0, stdout, "[TEST MATRIXMULBATCHED] matrix multiplication of the two tensors \n"); XPRINT(0, stdout, "[TEST MATRIXMULBATCHED] matrix multiplication of the two tensors \n");
......
...@@ -24,8 +24,10 @@ ...@@ -24,8 +24,10 @@
#include "TMerge.h" #include "TMerge.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: transform a tensor by merging it along with a dimension.
* In this case, (3, 2) -> (6), whereToMerge=1, leadingDim=0. /*
case 1: transform a tensor by merging it along with a dimension.
In this case, (3, 2) -> (6), whereToMerge=1, leadingDim=0.
*/ */
bool TestMerge1() bool TestMerge1()
{ {
...@@ -48,9 +50,9 @@ bool TestMerge1() ...@@ -48,9 +50,9 @@ bool TestMerge1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData[2][3] = { {0.0, 1.0, 2.0}, DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
{3.0, 4.0, 5.0} }; {3.0F, 4.0F, 5.0F} };
DTYPE answer[6] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; DTYPE answer[6] = {0.0F, 1.0F, 2.0F, 3.0F, 4.0F, 5.0F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -107,8 +109,9 @@ bool TestMerge1() ...@@ -107,8 +109,9 @@ bool TestMerge1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: transform a tensor by merging it along with a dimension. /*
* In this case, case 2: transform a tensor by merging it along with a dimension.
In this case,
(2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0. (2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0.
(2, 2, 3) -> (2, 6), whereToMerge=2, leadingDim=0. (2, 2, 3) -> (2, 6), whereToMerge=2, leadingDim=0.
*/ */
...@@ -145,16 +148,16 @@ bool TestMerge2() ...@@ -145,16 +148,16 @@ bool TestMerge2()
for (int i = 0; i < tOrder2; i++) for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i]; tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][2][3] = { { {0.0, 1.0, 2.0}, DTYPE sData[2][2][3] = { { {0.0F, 1.0F, 2.0F},
{4.0, 5.0, 6.0} }, {4.0F, 5.0F, 6.0F} },
{ {-1.0, 2.0, 3.0}, { {-1.0F, 2.0F, 3.0F},
{-4.0, -5.0, -6.0} } }; {-4.0F, -5.0F, -6.0F} } };
DTYPE answer1[4][3] = { {0.0, 1.0, 2.0}, DTYPE answer1[4][3] = { {0.0F, 1.0F, 2.0F},
{4.0, 5.0, 6.0}, {4.0F, 5.0F, 6.0F},
{-1.0, 2.0, 3.0}, {-1.0F, 2.0F, 3.0F},
{-4.0, -5.0, -6.0} }; {-4.0F, -5.0F, -6.0F} };
DTYPE answer2[2][6] = { {0.0, 1.0, 2.0, -1.0, 2.0, 3.0}, DTYPE answer2[2][6] = { {0.0F, 1.0F, 2.0F, -1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, -4.0, -5.0, -6.0} }; {4.0F, 5.0F, 6.0F, -4.0F, -5.0F, -6.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -222,7 +225,8 @@ bool TestMerge2() ...@@ -222,7 +225,8 @@ bool TestMerge2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: merge small tensors into a big tensor. /*
case 3: merge small tensors into a big tensor.
In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0. In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0.
*/ */
bool TestMerge3() bool TestMerge3()
...@@ -240,10 +244,10 @@ bool TestMerge3() ...@@ -240,10 +244,10 @@ bool TestMerge3()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0}, DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
{-4.0, -5.0, -6.0, -7.0} }; {-4.0F, -5.0F, -6.0F, -7.0F} };
/* a target tensor of size (4, 4) */ /* a target tensor of size (4, 4) */
int tOrder = 2; int tOrder = 2;
...@@ -255,10 +259,10 @@ bool TestMerge3() ...@@ -255,10 +259,10 @@ bool TestMerge3()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE answer[4][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE answer[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0}, {4.0F, 5.0F, 6.0F, 7.0F},
{0.0, -1.0, -2.0, -3.0}, {0.0F, -1.0F, -2.0F, -3.0F},
{-4.0, -5.0, -6.0, -7.0} }; {-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -336,7 +340,8 @@ bool TestMerge3() ...@@ -336,7 +340,8 @@ bool TestMerge3()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 4: merge small tensors into a big tensor. /*
case 4: merge small tensors into a big tensor.
In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1. In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1.
*/ */
bool TestMerge4() bool TestMerge4()
...@@ -354,10 +359,10 @@ bool TestMerge4() ...@@ -354,10 +359,10 @@ bool TestMerge4()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0}, DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
{-4.0, -5.0, -6.0, -7.0} }; {-4.0F, -5.0F, -6.0F, -7.0F} };
/* a target tensor of size (2, 8) */ /* a target tensor of size (2, 8) */
int tOrder = 2; int tOrder = 2;
...@@ -369,8 +374,8 @@ bool TestMerge4() ...@@ -369,8 +374,8 @@ bool TestMerge4()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE answer[2][8] = { {0.0, 1.0, 2.0, 3.0, 0.0, -1.0, -2.0, -3.0}, DTYPE answer[2][8] = { {0.0F, 1.0F, 2.0F, 3.0F, 0.0F, -1.0F, -2.0F, -3.0F},
{4.0, 5.0, 6.0, 7.0, -4.0, -5.0, -6.0, -7.0} }; {4.0F, 5.0F, 6.0F, 7.0F, -4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -454,7 +459,6 @@ bool TestMerge4() ...@@ -454,7 +459,6 @@ bool TestMerge4()
*/ */
/* test for Merge Function */ /* test for Merge Function */
extern "C"
bool TestMerge() bool TestMerge()
{ {
XPRINT(0, stdout, "[TEST MERGE] transform a tensor by merging it alone with a dimension or merge small tensors into a big tensor\n"); XPRINT(0, stdout, "[TEST MERGE] transform a tensor by merging it alone with a dimension or merge small tensors into a big tensor\n");
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TMultiplyElementWise.h" #include "TMultiplyElementWise.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: element-wise product of two tensors
* c(i) = a(i)*b(i) + \alpha * c(i) /*
* In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0. case 1: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0.
*/ */
bool TestMultiplyElementWise1() bool TestMultiplyElementWise1()
{ {
...@@ -58,12 +60,12 @@ bool TestMultiplyElementWise1() ...@@ -58,12 +60,12 @@ bool TestMultiplyElementWise1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][1] = { {0.0}, DTYPE sData1[2][1] = { {0.0F},
{1.0} }; {1.0F} };
DTYPE sData2[2][1] = { {2.0}, DTYPE sData2[2][1] = { {2.0F},
{3.0} }; {3.0F} };
DTYPE answer[2][1] = { {0.0}, DTYPE answer[2][1] = { {0.0F},
{3.0} }; {3.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -129,9 +131,10 @@ bool TestMultiplyElementWise1() ...@@ -129,9 +131,10 @@ bool TestMultiplyElementWise1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: element-wise product of two tensors /*
* c(i) = a(i)*b(i) + \alpha * c(i) case 2: element-wise product of two tensors
* In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0. c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/ */
bool TestMultiplyElementWise2() bool TestMultiplyElementWise2()
{ {
...@@ -165,12 +168,12 @@ bool TestMultiplyElementWise2() ...@@ -165,12 +168,12 @@ bool TestMultiplyElementWise2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0, 1.0}, DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0, 3.0} }; {2.0F, 3.0F} };
DTYPE sData2[2][2] = { {0.0, 1.0}, DTYPE sData2[2][2] = { {0.0F, 1.0F},
{2.0, 3.0} }; {2.0F, 3.0F} };
DTYPE answer[2][2] = { {0.0, 1.0}, DTYPE answer[2][2] = { {0.0F, 1.0F},
{4.0, 9.0} }; {4.0F, 9.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -236,8 +239,9 @@ bool TestMultiplyElementWise2() ...@@ -236,8 +239,9 @@ bool TestMultiplyElementWise2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i) /*
* In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0. case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0.
*/ */
bool TestMultiplyElementWise3() bool TestMultiplyElementWise3()
{ {
...@@ -271,12 +275,12 @@ bool TestMultiplyElementWise3() ...@@ -271,12 +275,12 @@ bool TestMultiplyElementWise3()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData1[2][2] = { {0.0, 1.0}, DTYPE sData1[2][2] = { {0.0F, 1.0F},
{2.0, 3.0} }; {2.0F, 3.0F} };
DTYPE sData2[2][2] = { {0.0, 1.0}, DTYPE sData2[2][2] = { {0.0F, 1.0F},
{2.0, 3.0} }; {2.0F, 3.0F} };
DTYPE answer[2][2] = { {0.0, 1.0}, DTYPE answer[2][2] = { {0.0F, 1.0F},
{4.0, 9.0} }; {4.0F, 9.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -348,7 +352,6 @@ TODO!! ...@@ -348,7 +352,6 @@ TODO!!
*/ */
/* test for MultiplyElementWise Function */ /* test for MultiplyElementWise Function */
extern "C"
bool TestMultiplyElementWise() bool TestMultiplyElementWise()
{ {
XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n"); XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "TNegate.h" #include "TNegate.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set every entry to its minus value */ /* case 1: set every entry to its minus value */
bool TestNegate1() bool TestNegate1()
{ {
...@@ -35,12 +36,12 @@ bool TestNegate1() ...@@ -35,12 +36,12 @@ bool TestNegate1()
for (int i = 0; i < aOrder; i++) for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i]; aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0, -2.0}, DTYPE aData[3][2] = { {1.0F, -2.0F},
{-3.0, 4.0}, {-3.0F, 4.0F},
{5.0, -6.0} }; {5.0F, -6.0F} };
DTYPE answer[3][2] = { {-1.0, 2.0}, DTYPE answer[3][2] = { {-1.0F, 2.0F},
{3.0, -4.0}, {3.0F, -4.0F},
{-5.0, 6.0} }; {-5.0F, 6.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -101,12 +102,12 @@ bool TestNegate2() ...@@ -101,12 +102,12 @@ bool TestNegate2()
for (int i = 0; i < aOrder; i++) for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i]; aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0, 0.0}, DTYPE aData[3][2] = { {0.0F, 0.0F},
{0.0, 0.0}, {0.0F, 0.0F},
{0.0, 0.0} }; {0.0F, 0.0F} };
DTYPE answer[3][2] = { {-0.0, -0.0}, DTYPE answer[3][2] = { {-0.0F, -0.0F},
{-0.0, -0.0}, {-0.0F, -0.0F},
{-0.0, -0.0} }; {-0.0F, -0.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -160,7 +161,6 @@ TODO!! ...@@ -160,7 +161,6 @@ TODO!!
*/ */
/* test for Negate Function */ /* test for Negate Function */
extern "C"
bool TestNegate() bool TestNegate()
{ {
XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n"); XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n");
......
...@@ -22,10 +22,12 @@ ...@@ -22,10 +22,12 @@
#include "TNormalize.h" #include "TNormalize.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: normalized the data with normal distribution
* For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b. /*
* where a and b are the scalar and bias respectively, case 1: normalized the data with normal distribution
* and \epsilon is the adjustment parameter. For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b.
where a and b are the scalar and bias respectively,
and \epsilon is the adjustment parameter.
*/ */
bool TestNormalize1() bool TestNormalize1()
{ {
...@@ -87,14 +89,14 @@ bool TestNormalize1() ...@@ -87,14 +89,14 @@ bool TestNormalize1()
for (int i = 0; i < bOrder; i++) for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i]; bUnitNum *= bDimSize[i];
DTYPE sData[2][3] = { {1.0, 2.0, 3.0}, DTYPE sData[2][3] = { {1.0F, 2.0F, 3.0F},
{1.5, 2.5, 3.5} }; {1.5F, 2.5F, 3.5F} };
DTYPE meanData[3] = {1.0, 1.5, 2.0}; DTYPE meanData[3] = {1.0F, 1.5F, 2.0F};
DTYPE varData[3] = {1.0, 1.0, 4.0}; DTYPE varData[3] = {1.0F, 1.0F, 4.0F};
DTYPE aData[2][3] = { {1.0, 1.0, 1.0}, DTYPE aData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0, 1.0, 1.0} }; {1.0F, 1.0F, 1.0F} };
DTYPE answer[2][3] = { {0.0, 0.5, 0.5}, DTYPE answer[2][3] = { {0.0F, 0.5F, 0.5F},
{0.5, 1.0, 0.75} }; {0.5F, 1.0F, 0.75F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -116,7 +118,7 @@ bool TestNormalize1() ...@@ -116,7 +118,7 @@ bool TestNormalize1()
t->SetZeroAll(); t->SetZeroAll();
/* call normalize function */ /* call normalize function */
Normalize(s, t, 0, mean, var, a, b, 0.0); Normalize(s, t, 0, mean, var, a, b, 0.0F);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum, 1e-4, 0); cpuTest = t->CheckData(answer, tUnitNum, 1e-4, 0);
...@@ -142,7 +144,7 @@ bool TestNormalize1() ...@@ -142,7 +144,7 @@ bool TestNormalize1()
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call Normalize function */ /* call Normalize function */
Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0); Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4, 0); gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4, 0);
...@@ -193,7 +195,6 @@ TODO!! ...@@ -193,7 +195,6 @@ TODO!!
*/ */
/* test for Normalize Function */ /* test for Normalize Function */
extern "C"
bool TestNormalize() bool TestNormalize()
{ {
XPRINT(0, stdout, "[TEST NORMALIZE] normalized the data with normal distribution \n"); XPRINT(0, stdout, "[TEST NORMALIZE] normalized the data with normal distribution \n");
......
...@@ -23,8 +23,10 @@ ...@@ -23,8 +23,10 @@
#include "TPower.h" #include "TPower.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the power(a, p)
* In this case, p=2. /*
case 1: get the power(a, p)
In this case, p=2.
*/ */
bool TestPower1() bool TestPower1()
{ {
...@@ -38,12 +40,12 @@ bool TestPower1() ...@@ -38,12 +40,12 @@ bool TestPower1()
for (int i = 0; i < aOrder; i++) for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i]; aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0, 2.0}, DTYPE aData[3][2] = { {1.0F, 2.0F},
{3.0, 4.0}, {3.0F, 4.0F},
{5.0, 6.0} }; {5.0F, 6.0F} };
DTYPE answer[3][2] = { {1.0, 4.0}, DTYPE answer[3][2] = { {1.0F, 4.0F},
{9.0, 16.0}, {9.0F, 16.0F},
{25.0, 36.0} }; {25.0F, 36.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -55,7 +57,7 @@ bool TestPower1() ...@@ -55,7 +57,7 @@ bool TestPower1()
a->SetData(aData, aUnitNum); a->SetData(aData, aUnitNum);
/* call Power function */ /* call Power function */
Power(a, 2.0); Power(a, 2.0F);
/* check results */ /* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F); cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
...@@ -71,7 +73,7 @@ bool TestPower1() ...@@ -71,7 +73,7 @@ bool TestPower1()
aGPU->SetData(aData, aUnitNum); aGPU->SetData(aData, aUnitNum);
/* call power function */ /* call power function */
Power(aGPU, 2.0); Power(aGPU, 2.0F);
/* check results */ /* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F); gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
...@@ -91,8 +93,9 @@ bool TestPower1() ...@@ -91,8 +93,9 @@ bool TestPower1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: get the power(a, p) /*
* In this case, p=1. case 2: get the power(a, p)
In this case, p=1.
*/ */
bool TestPower2() bool TestPower2()
{ {
...@@ -106,12 +109,12 @@ bool TestPower2() ...@@ -106,12 +109,12 @@ bool TestPower2()
for (int i = 0; i < aOrder; i++) for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i]; aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0, 1.0}, DTYPE aData[3][2] = { {0.0F, 1.0F},
{2.0, 3.0}, {2.0F, 3.0F},
{4.0, 5.0} }; {4.0F, 5.0F} };
DTYPE answer[3][2] = { {0.0, 1.0}, DTYPE answer[3][2] = { {0.0F, 1.0F},
{2.0, 3.0}, {2.0F, 3.0F},
{4.0, 5.0} }; {4.0F, 5.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -123,7 +126,7 @@ bool TestPower2() ...@@ -123,7 +126,7 @@ bool TestPower2()
a->SetData(aData, aUnitNum); a->SetData(aData, aUnitNum);
/* call Power function */ /* call Power function */
Power(a, 1.0); Power(a, 1.0F);
/* check results */ /* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F); cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
...@@ -139,7 +142,7 @@ bool TestPower2() ...@@ -139,7 +142,7 @@ bool TestPower2()
aGPU->SetData(aData, aUnitNum); aGPU->SetData(aData, aUnitNum);
/* call Power function */ /* call Power function */
Power(aGPU, 1.0); Power(aGPU, 1.0F);
/* check results */ /* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F); gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
...@@ -159,8 +162,9 @@ bool TestPower2() ...@@ -159,8 +162,9 @@ bool TestPower2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: get the power(a, p) /*
* In this case, p=0. case 3: get the power(a, p)
In this case, p=0.
*/ */
bool TestPower3() bool TestPower3()
{ {
...@@ -174,12 +178,12 @@ bool TestPower3() ...@@ -174,12 +178,12 @@ bool TestPower3()
for (int i = 0; i < aOrder; i++) for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i]; aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {0.0, 1.0}, DTYPE aData[3][2] = { {0.0F, 1.0F},
{2.0, 3.0}, {2.0F, 3.0F},
{4.0, 5.0} }; {4.0F, 5.0F} };
DTYPE answer[3][2] = { {1.0, 1.0}, DTYPE answer[3][2] = { {1.0F, 1.0F},
{1.0, 1.0}, {1.0F, 1.0F},
{1.0, 1.0} }; {1.0F, 1.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -191,7 +195,7 @@ bool TestPower3() ...@@ -191,7 +195,7 @@ bool TestPower3()
a->SetData(aData, aUnitNum); a->SetData(aData, aUnitNum);
/* call Power function */ /* call Power function */
Power(a, 0.0); Power(a, 0.0F);
/* check results */ /* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F); cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
...@@ -207,7 +211,7 @@ bool TestPower3() ...@@ -207,7 +211,7 @@ bool TestPower3()
aGPU->SetData(aData, aUnitNum); aGPU->SetData(aData, aUnitNum);
/* call Power function */ /* call Power function */
Power(aGPU, 0.0); Power(aGPU, 0.0F);
/* check results */ /* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F); gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
...@@ -233,7 +237,6 @@ TODO!! ...@@ -233,7 +237,6 @@ TODO!!
*/ */
/* test for Power Function */ /* test for Power Function */
extern "C"
bool TestPower() bool TestPower()
{ {
XPRINT(0, stdout, "[TEST POWER] get the power(a, p) \n"); XPRINT(0, stdout, "[TEST POWER] get the power(a, p) \n");
......
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
#include "TRectify.h" #include "TRectify.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test rectify function
* y = max(0, x) /*
case 1: test rectify function
In this case, y = max(0, x)
*/ */
bool TestRectify1() bool TestRectify1()
{ {
...@@ -47,10 +49,10 @@ bool TestRectify1() ...@@ -47,10 +49,10 @@ bool TestRectify1()
for (int i = 0; i < yOrder; i++) for (int i = 0; i < yOrder; i++)
yUnitNum *= yDimSize[i]; yUnitNum *= yDimSize[i];
DTYPE xData[2][3] = { {0.0, -1.0, 2.0}, DTYPE xData[2][3] = { {0.0F, -1.0F, 2.0F},
{3.0, -4.0, -5.0} }; {3.0F, -4.0F, -5.0F} };
DTYPE answer[2][3] = { {0.0, 0.0, 2.0}, DTYPE answer[2][3] = { {0.0F, 0.0F, 2.0F},
{3.0, 0.0, 0.0} }; {3.0F, 0.0F, 0.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -107,10 +109,11 @@ bool TestRectify1() ...@@ -107,10 +109,11 @@ bool TestRectify1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: backward computation /*
* dE/dx = dE/dy * dy/dx case 2: backward computation
* rectified: y = max(0, x) dE/dx = dE/dy * dy/dx
* In this case, lossName=CROSSENTROPY. rectified: y = max(0, x)
In this case, lossName=CROSSENTROPY.
*/ */
bool TestRectify2() bool TestRectify2()
{ {
...@@ -124,16 +127,16 @@ bool TestRectify2() ...@@ -124,16 +127,16 @@ bool TestRectify2()
for (int i = 0; i < xOrder; i++) for (int i = 0; i < xOrder; i++)
xUnitNum *= xDimSize[i]; xUnitNum *= xDimSize[i];
DTYPE xData[2][3] = { {1.0, 1.0, 2.0}, DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0, 4.0, 5.0} }; {2.0F, 4.0F, 5.0F} };
DTYPE yData[2][3] = { {1.0, 1.0, 2.0}, DTYPE yData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0, 4.0, 5.0} }; {2.0F, 4.0F, 5.0F} };
DTYPE goldData[2][3] = { {1.0, 1.0, 1.0}, DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0, 1.0, 1.0} }; {1.0F, 1.0F, 1.0F} };
DTYPE dedyData[2][3] = { {-1.0, -1.0, -0.5}, DTYPE dedyData[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5, -0.25, -0.2} }; {-0.5F, -0.25F, -0.2F} };
DTYPE answer[2][3] = { {-1.0, -1.0, -0.5}, DTYPE answer[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5, -0.25, -0.2} }; {-0.5F, -0.25F, -0.2F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -215,7 +218,6 @@ TODO!! ...@@ -215,7 +218,6 @@ TODO!!
*/ */
/* test for Rectify Function */ /* test for Rectify Function */
extern "C"
bool TestRectify() bool TestRectify()
{ {
XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n"); XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n");
......
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
#include "TReduceMax.h" #include "TReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the max value of the items along a dimension of the tensor.
* In this case, /*
case 1: get the max value of the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0 (2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1 (2, 4) -> (2), dim = 1
*/ */
...@@ -57,10 +59,10 @@ bool TestReduceMax1() ...@@ -57,10 +59,10 @@ bool TestReduceMax1()
for (int i = 0; i < tOrder2; i++) for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i]; tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0, 5.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
{4.0, 1.0, 6.0, 7.0} }; {4.0F, 1.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0, 5.0, 6.0, 7.0}; DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
DTYPE answer2[2] = {5.0, 7.0}; DTYPE answer2[2] = {5.0F, 7.0F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -134,7 +136,6 @@ TODO!! ...@@ -134,7 +136,6 @@ TODO!!
*/ */
/* test for ReduceMax Function */ /* test for ReduceMax Function */
extern "C"
bool TestReduceMax() bool TestReduceMax()
{ {
XPRINT(0, stdout, "[TEST ReduceMax] get the max value of the items along a dimension of the tensor\n"); XPRINT(0, stdout, "[TEST ReduceMax] get the max value of the items along a dimension of the tensor\n");
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "TReduceMean.h" #include "TReduceMean.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the mean value along a dimension of the tensor */ /* case 1: get the mean value along a dimension of the tensor */
bool TestReduceMean1() bool TestReduceMean1()
{ {
...@@ -53,10 +54,10 @@ bool TestReduceMean1() ...@@ -53,10 +54,10 @@ bool TestReduceMean1()
for (int i = 0; i < tOrder2; i++) for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i]; tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { { 0.0, 1.0, 2.0, 3.0 }, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{ 4.0, 5.0, 6.0, 7.0 } }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {2.0, 3.0, 4.0, 5.0}; DTYPE answer1[4] = {2.0F, 3.0F, 4.0F, 5.0F};
DTYPE answer2[2] = {1.5, 5.5}; DTYPE answer2[2] = {1.5F, 5.5F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -124,104 +125,12 @@ bool TestReduceMean1() ...@@ -124,104 +125,12 @@ bool TestReduceMean1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: get the mean value along a dimension of a large-scale tensor.
In this case, every entry of the source tensor is 1, so every mean is 1.
(10000, 500) -> (500), dim = 0
(10000, 500) -> (10000), dim = 1
<< return - true if the CPU check (and the second check when USE_CUDA is defined) passed
*/
bool TestReduceMeanForLargescale()
{
    /* a tensor of size 10000 * 500 */
    int order = 2;
    int order_reduce = 1;
    int * dimSize = new int[order];
    dimSize[0] = 10000;
    dimSize[1] = 500;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

    /* a tensor of size 500 */
    int * dimSize_reduce_a = new int[order_reduce];
    dimSize_reduce_a[0] = 500;

    int unitNum_a = 1;
    for (int i = 0; i < order_reduce; i++)
        unitNum_a *= dimSize_reduce_a[i];

    /* a tensor of size 10000 */
    int * dimSize_reduce_b = new int[order_reduce];
    dimSize_reduce_b[0] = 10000;

    int unitNum_b = 1;
    for (int i = 0; i < order_reduce; i++)
        unitNum_b *= dimSize_reduce_b[i];

    /* source data (all ones); sized from unitNum so the buffer always matches the shape */
    DTYPE * data = new DTYPE[unitNum];
    for (int i = 0; i < unitNum; i++)
        data[i] = 1;

    /* the mean of ones is one, along either dimension */
    DTYPE answer_a[500];
    for (int i = 0; i < unitNum_a; i++)
        answer_a[i] = 1;

    DTYPE answer_b[10000];
    for (int i = 0; i < unitNum_b; i++)
        answer_b[i] = 1;

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * a = NewTensor(order, dimSize);
    XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
    XTensor * b = NewTensor(order, dimSize);
    XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);

    /* initialize variables */
    a->SetData(data, unitNum);
    b->SetData(data, unitNum);

    /* call reduce mean function */
    ReduceMean(a, reduce_a, 0);
    ReduceMean(b, reduce_b, 1);

    /* check results */
    cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);

    bool result = cpuTest;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensor */
    /* NOTE(review): no device id is passed to NewTensor here - confirm these tensors really live on the GPU */
    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
    XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
    XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);

    /* Initialize variables */
    aGPU->SetData(data, unitNum);
    bGPU->SetData(data, unitNum);

    /* call reduce mean function */
    ReduceMean(aGPU, reduce_aGPU, 0);
    ReduceMean(bGPU, reduce_bGPU, 1);

    /* check results */
    gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);

    result = cpuTest && gpuTest;

    /* destroy GPU variables one by one: "delete p, q;" only deletes p (comma operator) */
    delete aGPU;
    delete reduce_aGPU;
    delete bGPU;
    delete reduce_bGPU;
#endif // USE_CUDA

    /* destroy variables (tensors first, then the raw buffers they were built from) */
    delete a;
    delete reduce_a;
    delete b;
    delete reduce_b;
    delete[] dimSize;
    delete[] dimSize_reduce_a;
    delete[] dimSize_reduce_b;
    delete[] data;

    return result;
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
*/ */
/* test for ReduceMean Function */ /* test for ReduceMean Function */
extern "C"
bool TestReduceMean() bool TestReduceMean()
{ {
XPRINT(0, stdout, "[TEST ReduceMean] get the mean value along a dimension of the tensor \n"); XPRINT(0, stdout, "[TEST ReduceMean] get the mean value along a dimension of the tensor \n");
...@@ -236,15 +145,6 @@ bool TestReduceMean() ...@@ -236,15 +145,6 @@ bool TestReduceMean()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceMeanForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
///* other cases test */ ///* other cases test */
///* ///*
//TODO!! //TODO!!
......
...@@ -24,13 +24,13 @@ ...@@ -24,13 +24,13 @@
#include "../core/ReduceMean.h" #include "../core/ReduceMean.h"
namespace nts { // namespace nt(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ReduceMean Function */ /* test for ReduceMean Function */
extern "C" extern "C"
bool TestReduceMean(); bool TestReduceMean();
} // namespace nt(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __TEST_REDUCEMEAN_H__ #endif // __TEST_REDUCEMEAN_H__
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
#include "TReduceSum.h" #include "TReduceSum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: sum the items along a dimension of the tensor.
* In this case, /*
case 1: sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0 (2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1 (2, 4) -> (2), dim = 1
*/ */
...@@ -57,10 +59,10 @@ bool TestReduceSum1() ...@@ -57,10 +59,10 @@ bool TestReduceSum1()
for (int i = 0; i < tOrder2; i++) for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i]; tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer1[4] = {4.0, 6.0, 8.0, 10.0}; DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
DTYPE answer2[2] = {6.0, 22.0}; DTYPE answer2[2] = {6.0F, 22.0F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -128,103 +130,12 @@ bool TestReduceSum1() ...@@ -128,103 +130,12 @@ bool TestReduceSum1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/*
case 2: sum the items along a dimension of a large-scale tensor.
In this case, every entry of the source tensor is 1, so each sum equals
the size of the reduced dimension.
(10000, 500) -> (500), dim = 0, every sum is 10000
(10000, 500) -> (10000), dim = 1, every sum is 500
<< return - true if the CPU check (and the second check when USE_CUDA is defined) passed
*/
bool TestReduceSumForLargescale()
{
    /* a tensor of size 10000 * 500 */
    int order = 2;
    int orderReduce = 1;
    int * dimSize = new int[order];
    dimSize[0] = 10000;
    dimSize[1] = 500;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

    /* a tensor of size 500 */
    int * dimSize_reduce_a = new int[orderReduce];
    dimSize_reduce_a[0] = 500;

    int unitNum_a = 1;
    for (int i = 0; i < orderReduce; i++)
        unitNum_a *= dimSize_reduce_a[i];

    /* a tensor of size 10000 */
    int * dimSize_reduce_b = new int[orderReduce];
    dimSize_reduce_b[0] = 10000;

    int unitNum_b = 1;
    for (int i = 0; i < orderReduce; i++)
        unitNum_b *= dimSize_reduce_b[i];

    /* source data (all ones); sized from unitNum so the buffer always matches the shape */
    DTYPE * data = new DTYPE[unitNum];
    for (int i = 0; i < unitNum; i++)
        data[i] = 1;

    /* summing over dim 0 adds up 10000 ones */
    DTYPE answer_a[500];
    for (int i = 0; i < unitNum_a; i++)
        answer_a[i] = 10000;

    /* summing over dim 1 adds up 500 ones */
    DTYPE answer_b[10000];
    for (int i = 0; i < unitNum_b; i++)
        answer_b[i] = 500;

    /* CPU test */
    bool cpuTest = true;

    /* create tensors */
    XTensor * a = NewTensor(order, dimSize);
    XTensor * reduce_a = NewTensor(orderReduce, dimSize_reduce_a);
    XTensor * b = NewTensor(order, dimSize);
    XTensor * reduce_b = NewTensor(orderReduce, dimSize_reduce_b);

    /* initialize variables */
    a->SetData(data, unitNum);
    b->SetData(data, unitNum);

    /* call reduce sum function */
    ReduceSum(a, reduce_a, 0);
    ReduceSum(b, reduce_b, 1);

    /* check results */
    cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);

    bool result = cpuTest;

#ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensor */
    /* NOTE(review): no device id is passed to NewTensor here - confirm these tensors really live on the GPU */
    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
    XTensor * reduce_aGPU = NewTensor(orderReduce, dimSize_reduce_a, X_FLOAT);
    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
    XTensor * reduce_bGPU = NewTensor(orderReduce, dimSize_reduce_b, X_FLOAT);

    /* Initialize variables */
    aGPU->SetData(data, unitNum);
    bGPU->SetData(data, unitNum);

    /* call reduce sum function */
    ReduceSum(aGPU, reduce_aGPU, 0);
    ReduceSum(bGPU, reduce_bGPU, 1);

    /* check results */
    gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);

    result = cpuTest && gpuTest;

    /* destroy GPU variables one by one: "delete p, q;" only deletes p (comma operator) */
    delete aGPU;
    delete reduce_aGPU;
    delete bGPU;
    delete reduce_bGPU;
#endif // USE_CUDA

    /* destroy variables (tensors first, then the raw buffers they were built from) */
    delete a;
    delete reduce_a;
    delete b;
    delete reduce_b;
    delete[] dimSize;
    delete[] dimSize_reduce_a;
    delete[] dimSize_reduce_b;
    delete[] data;

    return result;
}
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
*/ */
/* test for ReduceSum Function */ /* test for ReduceSum Function */
extern "C"
bool TestReduceSum() bool TestReduceSum()
{ {
XPRINT(0, stdout, "[TEST ReduceSum] sum the items along a dimension of the tensor.\n"); XPRINT(0, stdout, "[TEST ReduceSum] sum the items along a dimension of the tensor.\n");
...@@ -239,15 +150,6 @@ bool TestReduceSum() ...@@ -239,15 +150,6 @@ bool TestReduceSum()
else else
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestReduceSumForLargescale();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */ /* other cases test */
/* /*
TODO!! TODO!!
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TReduceSumSquared.h" #include "TReduceSumSquared.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: squared sum of the items along a dimension of the tensor.
* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2. /*
* In this case, (2, 4) -> (4), dim = 0. case 1: squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (2, 4) -> (4), dim = 0.
*/ */
bool TestReduceSumSquared1() bool TestReduceSumSquared1()
{ {
...@@ -56,10 +58,10 @@ bool TestReduceSumSquared1() ...@@ -56,10 +58,10 @@ bool TestReduceSumSquared1()
for (int i = 0; i < shiftOrder; i++) for (int i = 0; i < shiftOrder; i++)
shiftUnitNum *= shiftDimSize[i]; shiftUnitNum *= shiftDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE shiftData[4] = {1.0, -1.0, -1.0, 0.0}; DTYPE shiftData[4] = {1.0F, -1.0F, -1.0F, 0.0F};
DTYPE answer[4] = {10.0, 40.0, 58.0, 58.0}; DTYPE answer[4] = {10.0F, 40.0F, 58.0F, 58.0F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -125,9 +127,10 @@ bool TestReduceSumSquared1() ...@@ -125,9 +127,10 @@ bool TestReduceSumSquared1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 1: squared sum of the items along a dimension of the tensor. /*
* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2. case 2: squared sum of the items along a dimension of the tensor.
* In this case, (2, 4) -> (2), dim = 1. For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
In this case, (2, 4) -> (2), dim = 1.
*/ */
bool TestReduceSumSquared2() bool TestReduceSumSquared2()
{ {
...@@ -141,7 +144,7 @@ bool TestReduceSumSquared2() ...@@ -141,7 +144,7 @@ bool TestReduceSumSquared2()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* an output tensor of size (4) */ /* an output tensor of size (2) */
int tOrder = 1; int tOrder = 1;
int * tDimSize = new int[tOrder]; int * tDimSize = new int[tOrder];
tDimSize[0] = 2; tDimSize[0] = 2;
...@@ -150,7 +153,7 @@ bool TestReduceSumSquared2() ...@@ -150,7 +153,7 @@ bool TestReduceSumSquared2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
/* a shift tensor of size (4) */ /* a shift tensor of size (2) */
int shiftOrder = 1; int shiftOrder = 1;
int * shiftDimSize = new int[shiftOrder]; int * shiftDimSize = new int[shiftOrder];
shiftDimSize[0] = 2; shiftDimSize[0] = 2;
...@@ -159,10 +162,10 @@ bool TestReduceSumSquared2() ...@@ -159,10 +162,10 @@ bool TestReduceSumSquared2()
for (int i = 0; i < shiftOrder; i++) for (int i = 0; i < shiftOrder; i++)
shiftUnitNum *= shiftDimSize[i]; shiftUnitNum *= shiftDimSize[i];
DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE shiftData[2] = {-1.0, 1.0}; DTYPE shiftData[2] = {-1.0F, 1.0F};
DTYPE answer[2] = {30.0, 86.0}; DTYPE answer[2] = {30.0F, 86.0F};
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -234,7 +237,6 @@ TODO!! ...@@ -234,7 +237,6 @@ TODO!!
*/ */
/* test for ReduceSumSquared Function */ /* test for ReduceSumSquared Function */
extern "C"
bool TestReduceSumSquared() bool TestReduceSumSquared()
{ {
XPRINT(0, stdout, "[TEST ReduceSumSquared] squared sum of the items along a dimension of the tensor\n"); XPRINT(0, stdout, "[TEST ReduceSumSquared] squared sum of the items along a dimension of the tensor\n");
......
...@@ -22,9 +22,11 @@ ...@@ -22,9 +22,11 @@
#include "TReduceVariance.h" #include "TReduceVariance.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: variance of the items along a dimension of the tensor.
* For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2. /*
* In this case, (2, 4) -> (4), dim = 0. case 1: variance of the items along a dimension of the tensor.
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
In this case, (2, 4) -> (4), dim = 0.
*/ */
bool TestReduceVariance1() bool TestReduceVariance1()
{ {
...@@ -131,7 +133,6 @@ TODO!! ...@@ -131,7 +133,6 @@ TODO!!
*/ */
/* test for ReduceVariance Function */ /* test for ReduceVariance Function */
extern "C"
bool TestReduceVariance() bool TestReduceVariance()
{ {
XPRINT(0, stdout, "[TEST ReduceVariance] variance of the items along a dimension of the tensor\n"); XPRINT(0, stdout, "[TEST ReduceVariance] variance of the items along a dimension of the tensor\n");
......
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
#include "TScaleAndShift.h" #include "TScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: scale and shift all tensor entries.
* p = p * scale + shift /*
case 1: scale and shift all tensor entries.
p = p * scale + shift
*/ */
bool TestScaleAndShift1() bool TestScaleAndShift1()
{ {
...@@ -42,8 +44,8 @@ bool TestScaleAndShift1() ...@@ -42,8 +44,8 @@ bool TestScaleAndShift1()
DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F}, DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
{8.5F, 10.5F, 12.5F, 14.5F} }; {8.5F, 10.5F, 12.5F, 14.5F} };
DTYPE scaleFactor = 2.0; DTYPE scaleFactor = 2.0F;
DTYPE shiftFactor = 0.5; DTYPE shiftFactor = 0.5F;
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -97,7 +99,6 @@ TODO!! ...@@ -97,7 +99,6 @@ TODO!!
*/ */
/* test for ScaleAndShift Function */ /* test for ScaleAndShift Function */
extern "C"
bool TestScaleAndShift() bool TestScaleAndShift()
{ {
XPRINT(0, stdout, "[TEST ScaleAndShift] scale and shift all tensor entires\n"); XPRINT(0, stdout, "[TEST ScaleAndShift] scale and shift all tensor entires\n");
......
...@@ -20,12 +20,14 @@ ...@@ -20,12 +20,14 @@
*/ */
#include "TSelect.h" #include "TSelect.h"
#include "../xc/Mycode.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SelectRange function.
* It can generate a tensor with selected data
* in range[low,high] along the given dimension. case 1: test SelectRange function.
* In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3. It can generate a tensor with selected data in range[low,high] along the given dimension.
In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
*/ */
bool TestSelect1() bool TestSelect1()
{ {
...@@ -76,25 +78,25 @@ bool TestSelect1() ...@@ -76,25 +78,25 @@ bool TestSelect1()
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
return cpuTest;
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
bool gpuTest = true; bool gpuTest = true;
/* create tensors */ /* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0); XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
sGPU->SetData(sData, sUnitNum); sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call Select function */ /* call Select function */
SelectRange(sGPU, 1, 1, 3, tGPU); SelectRange(sGPU, 2, 1, 3, tGPU);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, sUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */ /* destroy variables */
delete s; delete s;
delete t; delete t;
...@@ -121,7 +123,6 @@ TODO!! ...@@ -121,7 +123,6 @@ TODO!!
*/ */
/* test for Select Function */ /* test for Select Function */
extern "C"
bool TestSelect() bool TestSelect()
{ {
XPRINT(0, stdout, "[TEST Select] generate a tensor with seleccted data in range[low,high] along the given dimension \n"); XPRINT(0, stdout, "[TEST Select] generate a tensor with seleccted data in range[low,high] along the given dimension \n");
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "TSetAscendingOrder.h" #include "TSetAscendingOrder.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set the cell to the ascending order along a given dimension. /* case 1: set the cell to the ascending order along a given dimension.
*/ */
bool TestSetAscendingOrder1() bool TestSetAscendingOrder1()
...@@ -92,7 +93,6 @@ TODO!! ...@@ -92,7 +93,6 @@ TODO!!
*/ */
/* test for SetAscendingOrder Function */ /* test for SetAscendingOrder Function */
extern "C"
bool TestSetAscendingOrder() bool TestSetAscendingOrder()
{ {
XPRINT(0, stdout, "[TEST SetAscendingOrder] set the cell to the ascending order along a given dimension \n"); XPRINT(0, stdout, "[TEST SetAscendingOrder] set the cell to the ascending order along a given dimension \n");
......
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
#include "TSetData.h" #include "TSetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: set the cell to the ascending order along a given dimension.
*/ /* case 1: set the cell to the ascending order along a given dimension. */
bool TestSetData1() bool TestSetData1()
{ {
/* an input tensor of size (2, 4) */ /* an input tensor of size (2, 4) */
...@@ -83,7 +83,6 @@ TODO!! ...@@ -83,7 +83,6 @@ TODO!!
*/ */
/* test for SetData Function */ /* test for SetData Function */
extern "C"
bool TestSetData() bool TestSetData()
{ {
XPRINT(0, stdout, "[TEST SetData] set the data of tensor \n"); XPRINT(0, stdout, "[TEST SetData] set the data of tensor \n");
......
...@@ -23,9 +23,11 @@ ...@@ -23,9 +23,11 @@
#include "TSigmoid.h" #include "TSigmoid.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test Sigmoid function and SigmoidBackward function.
* sigmoid function: y = 1/(1+exp(-x)) /*
* backward computation: dE/ds = dE/dy * dy/dx case 1: test Sigmoid function and SigmoidBackward function.
sigmoid function: y = 1/(1+exp(-x))
backward computation: dE/ds = dE/dy * dy/dx
*/ */
bool TestSigmoid1() bool TestSigmoid1()
{ {
...@@ -124,9 +126,10 @@ bool TestSigmoid1() ...@@ -124,9 +126,10 @@ bool TestSigmoid1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test Sigmoid function and SigmoidBackward function. /*
* sigmoid function: y = 1/(1+exp(-x)) case 2: test Sigmoid function and SigmoidBackward function.
* backward computation: dE/ds = dE/dy * dy/dx sigmoid function: y = 1/(1+exp(-x))
backward computation: dE/ds = dE/dy * dy/dx
*/ */
bool TestSigmoid2() bool TestSigmoid2()
{ {
...@@ -234,7 +237,6 @@ bool TestSigmoid2() ...@@ -234,7 +237,6 @@ bool TestSigmoid2()
*/ */
/* test for Sigmoid Function */ /* test for Sigmoid Function */
extern "C"
bool TestSigmoid() bool TestSigmoid()
{ {
XPRINT(0, stdout, "[TEST SIGMOID] sigmoid function and its backward computation \n"); XPRINT(0, stdout, "[TEST SIGMOID] sigmoid function and its backward computation \n");
......
...@@ -24,8 +24,10 @@ ...@@ -24,8 +24,10 @@
#include "TSoftmax.h" #include "TSoftmax.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test Softmax function.
* softmax function: y = e^x / \sum_{i} e^{x_i} /*
case 1: test Softmax function.
softmax function: y = e^x / \sum_{i} e^{x_i}
*/ */
bool TestSoftmax1() bool TestSoftmax1()
{ {
...@@ -96,8 +98,9 @@ bool TestSoftmax1() ...@@ -96,8 +98,9 @@ bool TestSoftmax1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test SoftmaxBackward function. /*
* SoftmaxBackward function: dE/dx_j = -gold_j + y_j case 2: test SoftmaxBackward function.
SoftmaxBackward function: dE/dx_j = -gold_j + y_j
*/ */
bool TestSoftmax2() bool TestSoftmax2()
{ {
...@@ -200,7 +203,6 @@ bool TestSoftmax2() ...@@ -200,7 +203,6 @@ bool TestSoftmax2()
*/ */
/* test for Softmax Function */ /* test for Softmax Function */
extern "C"
bool TestSoftmax() bool TestSoftmax()
{ {
XPRINT(0, stdout, "[TEST SOFTMAX] softmax function and its backward computation \n"); XPRINT(0, stdout, "[TEST SOFTMAX] softmax function and its backward computation \n");
......
...@@ -22,7 +22,8 @@ ...@@ -22,7 +22,8 @@
#include "TSort.h" #include "TSort.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: sort the tensor along a given dimension*/
/* case 1: sort the tensor along a given dimension */
bool TestSort1() bool TestSort1()
{ {
/* a tensor of size (2, 4) */ /* a tensor of size (2, 4) */
...@@ -35,10 +36,10 @@ bool TestSort1() ...@@ -35,10 +36,10 @@ bool TestSort1()
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
DTYPE aData[2][4] = { { 0.0F, 1.0F, 2.0F, 3.0F }, DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{ 4.0F, 5.0F, 6.0F, 7.0F } }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { { 4.0F, 5.0F, 6.0F, 7.0F }, DTYPE answer[2][4] = { {4.0F, 5.0F, 6.0F, 7.0F},
{ 0.0F, 1.0F, 2.0F, 3.0F } }; {0.0F, 1.0F, 2.0F, 3.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -104,10 +105,10 @@ bool TestSort2() ...@@ -104,10 +105,10 @@ bool TestSort2()
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
DTYPE aData[2][4] = { { 0.0, 1.0, 2.0, 3.0 }, DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{ 4.0, 5.0, 6.0, 7.0 } }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][4] = { { 3.0, 2.0, 1.0, 0.0 }, DTYPE answer[2][4] = { {3.0F, 2.0F, 1.0F, 0.0F},
{ 7.0, 6.0, 5.0, 4.0 } }; {7.0F, 6.0F, 5.0F, 4.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -166,7 +167,6 @@ TODO!! ...@@ -166,7 +167,6 @@ TODO!!
*/ */
/* test for Sort Function */ /* test for Sort Function */
extern "C"
bool TestSort() bool TestSort()
{ {
XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n"); XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");
......
...@@ -19,18 +19,17 @@ ...@@ -19,18 +19,17 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-13 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-13
*/ */
#include "../XTensor.h" #include "TSplit.h"
#include "../XDevice.h"
#include "../core/Split.h"
#include "../XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
* In this case, 4 * 3 -> 2 * 2 * 3, whereToSplit=0, splitNum=2. /*
case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
In this case, (4, 3) -> (2, 2, 3), whereToSplit=0, splitNum=2.
*/ */
bool TestSplit1() bool TestSplit1()
{ {
/* a source tensor of size 4 * 3 */ /* a source tensor of size (4, 3) */
int sOrder = 2; int sOrder = 2;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 4; sDimSize[0] = 4;
...@@ -40,7 +39,7 @@ bool TestSplit1() ...@@ -40,7 +39,7 @@ bool TestSplit1()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a target tensor of size 2 * 2 * 3 */ /* a target tensor of size (2, 2, 3) */
int tOrder = 3; int tOrder = 3;
int * tDimSize = new int[tOrder]; int * tDimSize = new int[tOrder];
tDimSize[0] = 2; tDimSize[0] = 2;
...@@ -109,12 +108,13 @@ bool TestSplit1() ...@@ -109,12 +108,13 @@ bool TestSplit1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) /*
* In this case, 3 * 4 -> 2 * 3 * 2, whereToSplit=1, splitNum=2. case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
In this case, (3, 4) -> (2, 3, 2), whereToSplit=1, splitNum=2.
*/ */
bool TestSplit2() bool TestSplit2()
{ {
/* a source tensor of size 3 * 4 */ /* a source tensor of size (3, 4) */
int sOrder = 2; int sOrder = 2;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 3; sDimSize[0] = 3;
...@@ -124,7 +124,7 @@ bool TestSplit2() ...@@ -124,7 +124,7 @@ bool TestSplit2()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a target tensor of size 2 * 3 * 2 */ /* a target tensor of size (2, 3, 2) */
int tOrder = 3; int tOrder = 3;
int * tDimSize = new int[tOrder]; int * tDimSize = new int[tOrder];
tDimSize[0] = 2; tDimSize[0] = 2;
...@@ -194,8 +194,9 @@ bool TestSplit2() ...@@ -194,8 +194,9 @@ bool TestSplit2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 3: split a big tensor into small tensors /*
* In this case, 3 * 4 -> 2 * (3 * 2) , whereToSplit=1, splitNum=2. case 3: split a big tensor into small tensors
In this case, (3, 4) -> 2 * (3, 2) , whereToSplit=1, splitNum=2.
*/ */
bool TestSplit3() bool TestSplit3()
{ {
...@@ -203,7 +204,7 @@ bool TestSplit3() ...@@ -203,7 +204,7 @@ bool TestSplit3()
XList tList; XList tList;
tList = XList(); tList = XList();
/* a source tensor of size (3 * 4) */ /* a source tensor of size (3, 4) */
int sOrder = 2; int sOrder = 2;
int * sDimSize = new int[sOrder]; int * sDimSize = new int[sOrder];
sDimSize[0] = 3; sDimSize[0] = 3;
...@@ -213,7 +214,7 @@ bool TestSplit3() ...@@ -213,7 +214,7 @@ bool TestSplit3()
for (int i = 0; i < sOrder; i++) for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i]; sUnitNum *= sDimSize[i];
/* a target tensor of size (3 * 2) */ /* a target tensor of size (3, 2) */
int tOrder1 = 2; int tOrder1 = 2;
int * tDimSize1 = new int[tOrder1]; int * tDimSize1 = new int[tOrder1];
tDimSize1[0] = 3; tDimSize1[0] = 3;
...@@ -313,10 +314,9 @@ TODO!! ...@@ -313,10 +314,9 @@ TODO!!
*/ */
/* test for Split Function */ /* test for Split Function */
extern "C" bool TestSplit()
bool TestSplit()
{ {
XPRINT(0, stdout, "[TEST SPLIT] -------------\n"); XPRINT(0, stdout, "[TEST SPLIT] split a big tensor into small tensors \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
......
...@@ -22,7 +22,8 @@ ...@@ -22,7 +22,8 @@
#include "TSum.h" #include "TSum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1 */
/* case 1: tensor summation c = a + b * \beta */
bool TestSum1() bool TestSum1()
{ {
/* a tensor of size (2, 4) */ /* a tensor of size (2, 4) */
...@@ -35,12 +36,12 @@ bool TestSum1() ...@@ -35,12 +36,12 @@ bool TestSum1()
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
unitNum *= dimSize[i]; unitNum *= dimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0, -1.0, -3.0, -5.0}, DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0, -9.0, -11.0, -13.0} }; {-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {1.0, 0.0, -1.0, -2.0}, DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
{-3.0, -4.0, -5.0, -6.0} }; {-3.0F, -4.0F, -5.0F, -6.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -95,7 +96,7 @@ bool TestSum1() ...@@ -95,7 +96,7 @@ bool TestSum1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2 */ /* case 2: tensor summation c = a + b * \beta */
bool TestSum2() bool TestSum2()
{ {
/* a tensor of size (2, 4) */ /* a tensor of size (2, 4) */
...@@ -108,12 +109,12 @@ bool TestSum2() ...@@ -108,12 +109,12 @@ bool TestSum2()
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
unitNum *= dimSize[i]; unitNum *= dimSize[i];
} }
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][4] = { {1.0, -1.0, -3.0, -5.0}, DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F},
{-7.0, -9.0, -11.0, -13.0} }; {-7.0F, -9.0F, -11.0F, -13.0F} };
DTYPE answer[2][4] = { {0.5, 0.5, 0.5, 0.5}, DTYPE answer[2][4] = { {0.5F, 0.5F, 0.5F, 0.5F},
{0.5, 0.5, 0.5, 0.5} }; {0.5F, 0.5F, 0.5F, 0.5F} };
float beta = 0.5F; float beta = 0.5F;
/* CPU test */ /* CPU test */
...@@ -129,7 +130,7 @@ bool TestSum2() ...@@ -129,7 +130,7 @@ bool TestSum2()
b->SetData(bData, unitNum); b->SetData(bData, unitNum);
c->SetZeroAll(); c->SetZeroAll();
/* call sum function */ /* call Sum function */
Sum(a, b, c, beta); Sum(a, b, c, beta);
/* check results */ /* check results */
...@@ -149,7 +150,7 @@ bool TestSum2() ...@@ -149,7 +150,7 @@ bool TestSum2()
bGPU->SetData(bData, unitNum); bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll(); cGPU->SetZeroAll();
/* call sum function */ /* call Sum function */
Sum(aGPU, bGPU, cGPU, beta); Sum(aGPU, bGPU, cGPU, beta);
/* check results */ /* check results */
...@@ -182,8 +183,7 @@ bool TestSum2() ...@@ -182,8 +183,7 @@ bool TestSum2()
*/ */
/* test for Sum Function */ /* test for Sum Function */
extern "C" bool TestSum()
bool TestSum()
{ {
XPRINT(0, stdout, "[TEST SUM] tensor summation c = a + b * beta\n"); XPRINT(0, stdout, "[TEST SUM] tensor summation c = a + b * beta\n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
......
...@@ -22,9 +22,10 @@ ...@@ -22,9 +22,10 @@
#include "TSumByColumnTV.h" #include "TSumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SumByColumnTV function
* sum of a tensor and a vector (column vector) /*
* in a column by column manner case 1: test SumByColumnTV function
sum of a tensor and a vector (column vector) in a column by column manner
*/ */
bool TestSumByColumnTV1() bool TestSumByColumnTV1()
{ {
...@@ -58,12 +59,12 @@ bool TestSumByColumnTV1() ...@@ -58,12 +59,12 @@ bool TestSumByColumnTV1()
for (int i = 0; i < cOrder; i++) for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i]; cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0}, DTYPE bData[2][1] = { {1.0F},
{0.0} }; {0.0F} };
DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0}, DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -128,9 +129,9 @@ bool TestSumByColumnTV1() ...@@ -128,9 +129,9 @@ bool TestSumByColumnTV1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test SumByColumnTV function /*
* sum of a tensor and a vector (column vector) case 2: test SumByColumnTV function
* in a column by column manner sum of a tensor and a vector (column vector) in a column by column manner
*/ */
bool TestSumByColumnTV2() bool TestSumByColumnTV2()
{ {
...@@ -154,12 +155,12 @@ bool TestSumByColumnTV2() ...@@ -154,12 +155,12 @@ bool TestSumByColumnTV2()
for (int i = 0; i < bOrder; i++) for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i]; bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0}, DTYPE bData[2][1] = { {1.0F},
{0.0} }; {0.0F} };
DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0}, DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -222,7 +223,6 @@ bool TestSumByColumnTV2() ...@@ -222,7 +223,6 @@ bool TestSumByColumnTV2()
*/ */
/* test for SumByColumnTV Function */ /* test for SumByColumnTV Function */
extern "C"
bool TestSumByColumnTV() bool TestSumByColumnTV()
{ {
XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n"); XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");
......
...@@ -22,9 +22,10 @@ ...@@ -22,9 +22,10 @@
#include "TSumByColumnVT.h" #include "TSumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: test SumByColumnVT function
* sum of a vector (column vector) and a tensor /*
* in a column by column manner case 1: test SumByColumnVT function
sum of a vector (column vector) and a tensor in a column by column manner
*/ */
bool TestSumByColumnVT1() bool TestSumByColumnVT1()
{ {
...@@ -58,12 +59,12 @@ bool TestSumByColumnVT1() ...@@ -58,12 +59,12 @@ bool TestSumByColumnVT1()
for (int i = 0; i < cOrder; i++) for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i]; cUnitNum *= cDimSize[i];
DTYPE aData[2][1] = { {1.0}, DTYPE aData[2][1] = { {1.0F},
{0.0} }; {0.0F} };
DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0}, DTYPE answer[2][1] = { {7.0F},
{22.0} }; {22.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -129,9 +130,9 @@ bool TestSumByColumnVT1() ...@@ -129,9 +130,9 @@ bool TestSumByColumnVT1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: test SumByColumnVT function /*
* sum of a vector (column vector) and a tensor case 2: test SumByColumnVT function
* in a column by column manner sum of a vector (column vector) and a tensor in a column by column manner
*/ */
bool TestSumByColumnVT2() bool TestSumByColumnVT2()
{ {
...@@ -155,12 +156,12 @@ bool TestSumByColumnVT2() ...@@ -155,12 +156,12 @@ bool TestSumByColumnVT2()
for (int i = 0; i < bOrder; i++) for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i]; bUnitNum *= bDimSize[i];
DTYPE aData[2][1] = { {1.0}, DTYPE aData[2][1] = { {1.0F},
{0.0} }; {0.0F} };
DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0}, DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0, 5.0, 6.0, 7.0} }; {4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0}, DTYPE answer[2][1] = { {7.0F},
{22.0} }; {22.0F} };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -223,7 +224,6 @@ bool TestSumByColumnVT2() ...@@ -223,7 +224,6 @@ bool TestSumByColumnVT2()
*/ */
/* test for SumByColumnVT Function */ /* test for SumByColumnVT Function */
extern "C"
bool TestSumByColumnVT() bool TestSumByColumnVT()
{ {
XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n"); XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");
......
...@@ -22,10 +22,12 @@ ...@@ -22,10 +22,12 @@
#include "TTopK.h" #include "TTopK.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: get the top-k items along a given dimension.
* In this case, /*
* (2, 4) -> (2, 4), dim = 0, k = 2 case 1: get the top-k items along a given dimension.
* (2, 4) -> (2, 4), dim = 1, k = 4 In this case,
(2, 4) -> (2, 4), dim = 0, k = 2
(2, 4) -> (2, 4), dim = 1, k = 4
*/ */
bool TestTopK1() bool TestTopK1()
{ {
...@@ -49,16 +51,16 @@ bool TestTopK1() ...@@ -49,16 +51,16 @@ bool TestTopK1()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0}, DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
{4.0, 3.0, 7.0, 6.0} }; {4.0F, 3.0F, 7.0F, 6.0F} };
DTYPE tAnswer1[2][4] = { {5.0, 3.0, 7.0, 8.0}, DTYPE tAnswer1[2][4] = { {5.0F, 3.0F, 7.0F, 8.0F},
{4.0, 1.0, 2.0, 6.0} }; {4.0F, 1.0F, 2.0F, 6.0F} };
int indexAnswer1[2][4] = { {0, 1, 1, 0}, int indexAnswer1[2][4] = { {0, 1, 1, 0},
{1, 0, 0, 1} }; {1, 0, 0, 1} };
DTYPE tAnswer2[2][4] = { {8.0, 5.0, 2.0, 1.0}, DTYPE tAnswer2[2][4] = { {8.0F, 5.0F, 2.0F, 1.0F},
{7.0, 6.0, 4.0, 3.0} }; {7.0F, 6.0F, 4.0F, 3.0F} };
int indexAnswer2[2][4] = { {3, 0, 2, 1}, int indexAnswer2[2][4] = { {3, 0, 2, 1},
{2, 3, 0, 1} }; {2, 3, 0, 1} };
...@@ -156,9 +158,9 @@ bool TestTopK1() ...@@ -156,9 +158,9 @@ bool TestTopK1()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* case 2: get the top-k items along a given dimension. /*
* In this case, case 2: get the top-k items along a given dimension.
* (2, 4) -> (2, 2), dim = 1, k = 2 In this case, (2, 4) -> (2, 2), dim = 1, k = 2.
*/ */
bool TestTopK2() bool TestTopK2()
{ {
...@@ -182,10 +184,10 @@ bool TestTopK2() ...@@ -182,10 +184,10 @@ bool TestTopK2()
for (int i = 0; i < tOrder; i++) for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i]; tUnitNum *= tDimSize[i];
DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0}, DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
{4.0, 3.0, 7.0, 6.0} }; {4.0F, 3.0F, 7.0F, 6.0F} };
DTYPE tAnswer[2][2] = { {8.0, 5.0}, DTYPE tAnswer[2][2] = { {8.0F, 5.0F},
{7.0, 6.0} }; {7.0F, 6.0F} };
int indexAnswer[2][2] = { {3, 0}, int indexAnswer[2][2] = { {3, 0},
{2, 3} }; {2, 3} };
...@@ -255,14 +257,12 @@ bool TestTopK2() ...@@ -255,14 +257,12 @@ bool TestTopK2()
#endif // USE_CUDA #endif // USE_CUDA
} }
/* other cases */ /* other cases */
/* /*
TODO!! TODO!!
*/ */
/* test for TopK Function */ /* test for TopK Function */
extern "C"
bool TestTopK() bool TestTopK()
{ {
XPRINT(0, stdout, "[TEST TopK] get the top-k items along a given dimension\n"); XPRINT(0, stdout, "[TEST TopK] get the top-k items along a given dimension\n");
......
...@@ -19,15 +19,16 @@ ...@@ -19,15 +19,16 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-13 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-13
*/ */
#include "../XTensor.h"
#include "../core/Unsqueeze.h"
#include "../XList.h" #include "../XList.h"
#include "TUnsqueeze.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* case 1: insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension)
* In this case, /*
* (2, 3) -> (2, 2, 3), dim=1, dSize=2 case 1: insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension)
* (2, 3) -> (2, 3, 2), dim=2, dSize=2 In this case,
(2, 3) -> (2, 2, 3), dim=1, dSize=2
(2, 3) -> (2, 3, 2), dim=2, dSize=2
*/ */
bool TestUnsqueeze1() bool TestUnsqueeze1()
{ {
...@@ -63,18 +64,18 @@ bool TestUnsqueeze1() ...@@ -63,18 +64,18 @@ bool TestUnsqueeze1()
for (int i = 0; i < tOrder2; i++) for (int i = 0; i < tOrder2; i++)
tUnitNum2 *= tDimSize2[i]; tUnitNum2 *= tDimSize2[i];
DTYPE sData[2][3] = { {0.0, 1.0, 2.0}, DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
{3.0, 4.0, 5.0} }; {3.0F, 4.0F, 5.0F} };
DTYPE answer1[2][2][3] = { { {0.0, 1.0, 2.0}, DTYPE answer1[2][2][3] = { { {0.0F, 1.0F, 2.0F},
{0.0, 1.0, 2.0} }, {0.0F, 1.0F, 2.0F} },
{ {3.0, 4.0, 5.0}, { {3.0F, 4.0F, 5.0F},
{3.0, 4.0, 5.0} } }; {3.0F, 4.0F, 5.0F} } };
DTYPE answer2[2][3][2] = { { {0.0, 0.0}, DTYPE answer2[2][3][2] = { { {0.0F, 0.0F},
{1.0, 1.0}, {1.0F, 1.0F},
{2.0, 2.0} }, {2.0F, 2.0F} },
{ {3.0, 3.0}, { {3.0F, 3.0F},
{4.0, 4.0}, {4.0F, 4.0F},
{5.0, 5.0} } }; {5.0F, 5.0F} } };
/* CPU test */ /* CPU test */
bool cpuTest = true; bool cpuTest = true;
...@@ -148,7 +149,6 @@ bool TestUnsqueeze1() ...@@ -148,7 +149,6 @@ bool TestUnsqueeze1()
*/ */
/* test for Unsqueeze Function */ /* test for Unsqueeze Function */
extern "C"
bool TestUnsqueeze() bool TestUnsqueeze()
{ {
XPRINT(0, stdout, "[TEST Unsqueeze] insert a dimension by copying the blocks for x times\n"); XPRINT(0, stdout, "[TEST Unsqueeze] insert a dimension by copying the blocks for x times\n");
......
...@@ -19,14 +19,13 @@ ...@@ -19,14 +19,13 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-6-24 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-6-24
*/ */
#include "TXMem.h"
#include "../XGlobal.h" #include "../XGlobal.h"
#include "../XUtility.h" #include "../XUtility.h"
#include "../XMem.h" #include "TXMem.h"
/* the nts (NiuTrans.Tensor) namespace */ namespace nts{ // namespace nts(NiuTrans.Tensor)
namespace nts{
/* case 1: test memory pool class */
bool TestXMemCase1() bool TestXMemCase1()
{ {
bool ok = true; bool ok = true;
...@@ -83,6 +82,7 @@ bool TestXMemCase1() ...@@ -83,6 +82,7 @@ bool TestXMemCase1()
return ok; return ok;
} }
/* test for memory pool class */
bool TestXMem() bool TestXMem()
{ {
XPRINT(0, stdout, "[Test] Memory pool ... Began\n"); XPRINT(0, stdout, "[Test] Memory pool ... Began\n");
...@@ -93,11 +93,18 @@ bool TestXMem() ...@@ -93,11 +93,18 @@ bool TestXMem()
/* case 1 test */ /* case 1 test */
caseFlag = TestXMemCase1(); caseFlag = TestXMemCase1();
if (!caseFlag) { returnFlag = false; XPRINT(0, stdout, ">> case 1 failed!\n"); } if (!caseFlag) {
else {XPRINT(0, stdout, ">> case 1 passed!\n");} returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
if (returnFlag) { XPRINT(0, stdout, ">> All Passed!\n"); } if (returnFlag) {
else { XPRINT(0, stdout, ">> Failed!\n"); } XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
double endT = GetClock(); double endT = GetClock();
...@@ -106,4 +113,4 @@ bool TestXMem() ...@@ -106,4 +113,4 @@ bool TestXMem()
return returnFlag; return returnFlag;
} }
} /* end of the nts (NiuTrans.Tensor) namespace */ } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -22,13 +22,13 @@ ...@@ -22,13 +22,13 @@
#ifndef __TXMEM_H__ #ifndef __TXMEM_H__
#define __TXMEM_H__ #define __TXMEM_H__
/* the nts (NiuTrans.Tensor) namespace */ #include "../XMem.h"
namespace nts{
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* test for memory pool class */ /* test for memory pool class */
extern "C" extern "C"
bool TestXMem(); bool TestXMem();
} /* end of the nts (NiuTrans.Tensor) namespace */ } // namespace nts(NiuTrans.Tensor)
#endif // __TXMEM_H__
#endif
...@@ -62,7 +62,7 @@ bool Test() ...@@ -62,7 +62,7 @@ bool Test()
wrong = !TestXMem() || wrong; wrong = !TestXMem() || wrong;
//wrong = !TestHardTanH() || wrong; //wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity || wrong; //wrong = !TestIdentity() || wrong;
//wrong = !TestLogSoftmax() || wrong; //wrong = !TestLogSoftmax() || wrong;
//wrong = !TestLoss() || wrong; //wrong = !TestLoss() || wrong;
//wrong = !TestRectify() || wrong; //wrong = !TestRectify() || wrong;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论