Commit abeb3e64 by liyinqiao

merged

parents dcabc2b0 414ff54f
......@@ -38,7 +38,7 @@
#include "XMem.h"
#include "XHeap.h"
#include "XBLAS.h"
#include "core/MergeBlockLists.h"
#include "core/shape/MergeBlockLists.h"
#ifdef USE_CUDA
......@@ -47,8 +47,8 @@
#include <cublas_v2.h>
#include <cuda.h>
#include <curand.h>
#include "core/FlushToMem.cuh"
#include "core/SetAscendingOrder.cuh"
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#endif
......@@ -555,6 +555,27 @@ bool XTensor::CheckData(const void * d, int num, int beg)
return true;
}
bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
{
if (data == NULL || d == NULL)
return false;
CheckNTErrors(!isSparse, "TODO");
CheckNTErrors(num == unitNum - beg, "Illegal size!");
DTYPE * valuePtr = (DTYPE*)data + beg;
DTYPE value = 0;
DTYPE * answerPtr = (DTYPE*)d;
for (int i = 0; i < num; i++) {
value = ToCPU(devID, valuePtr);
if (fabs(value - *answerPtr) > tolerance)
return false;
valuePtr++;
answerPtr++;
}
return true;
}
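For clarity, here is a minimal host-side sketch of what the tolerance overload checks (plain C++, independent of XTensor; CheckDataRef is an illustrative name, not part of this commit):

#include <cmath>
#include <cstdio>

/* standalone reference of the tolerance test above: two arrays match
   if every pair of items differs by no more than eps */
bool CheckDataRef(const float * x, const float * y, int num, float eps)
{
    for (int i = 0; i < num; i++) {
        if (std::fabs(x[i] - y[i]) > eps)
            return false;
    }
    return true;
}

int main()
{
    float a[3] = {1.0F, 2.0F, 3.0F};
    float b[3] = {1.0F, 2.0F, 3.0001F};
    printf("%d\n", CheckDataRef(a, b, 3, 1e-3F) ? 1 : 0); /* prints 1 */
    return 0;
}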
/*
set the cell to the ascending order along a given dimension
>> dim - the dimension specified
......@@ -696,6 +717,63 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2)
return ToCPU(devID, value);
}
/*
get the value of a cell in a 1d tensor in int type
>> i - index
<< return - value of cell(i) in int
*/
int XTensor::Get1DInt(int i)
{
CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
int dims[1] = {i};
void * value = GetCell(dims, 1);
return ToCPUInt(devID, value);
}
/*
get the value of a cell in a 2d tensor in int type
>> ni - row index
>> mi - column index
<< return - value of cell(ni, mi) in int
*/
int XTensor::Get2DInt(int ni, int mi)
{
CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
return ToCPUInt(devID, value);
}
/*
get the value of a cell in a 3d tensor in int type
>> d0 - index of dimension 0
>> d1 - index of dimension 1
>> d2 - index of dimension 2
<< return - value of cell(d0, d1, d2) in int
*/
int XTensor::Get3DInt(int d0, int d1, int d2)
{
CheckNTErrors((order == 3), "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((d1 >= 0 && d1 < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
return ToCPUInt(devID, value);
}
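As an aside, a sketch of the index arithmetic behind these getters, assuming the row-major layout that the bound checks suggest (GetCell hides the real layout; Offset3D is our own illustration):

#include <cstdio>

/* illustrative only: flat offset of cell (d0, d1, d2) in a row-major
   3d array; the library resolves this via GetCell() internally */
int Offset3D(const int dims[3], int d0, int d1, int d2)
{
    return (d0 * dims[1] + d1) * dims[2] + d2;
}

int main()
{
    int dims[3] = {2, 3, 4};
    int data[24];
    for (int i = 0; i < 24; i++)
        data[i] = i;
    /* cell (1, 2, 3) -> offset 1 * 12 + 2 * 4 + 3 = 23 */
    printf("%d\n", data[Offset3D(dims, 1, 2, 3)]);
    return 0;
}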
/*
get the value of a cell in the sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
......
......@@ -211,6 +211,9 @@ struct XTensor
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
/* check whether the data array is the same as the answer within a tolerance */
bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
/* set the cell to the ascending order along a given dimension */
void SetAscendingOrder(int dim);
......@@ -220,15 +223,24 @@ struct XTensor
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1);
/* get the value of a cell in a 1d tensor */
/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i);
/* get the value of a cell in a 2d tensor */
/* get the default type value of a cell in a 2d tensor */
DTYPE Get2D(int ni, int mi);
/* get the value of a cell in a 3d tensor */
/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);
/* get the int value of a cell in a 1d tensor */
int Get1DInt(int i);
/* get the int value of a cell in a 2d tensor */
int Get2DInt(int ni, int mi);
/* get the int value of a cell in a 3d tensor */
int Get3DInt(int d0, int d1, int d2);
/* get the value of a cell in a sparse tensor */
DTYPE GetInSparse(int i);
......
......@@ -26,43 +26,49 @@
#include "../XTensor.h"
#include "Concatenate.h"
#include "ConcatenateSolely.h"
#include "CopyIndexed.h"
#include "CopyInGrid.h"
#include "CopyValues.h"
#include "FlushToMem.h"
#include "MakeMergeBlockIndex.h"
#include "MakeSplitBlockIndex.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMul2DMultiTheading.h"
#include "MatrixMul2DParallel.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "Merge.h"
#include "MergeBlockLists.h"
#include "Multiply.h"
#include "Negate.h"
#include "Normalize.h"
#include "Permute.h"
#include "Power.h"
#include "ReduceMax.h"
#include "ReduceMean.h"
#include "ReduceStandardVariance.h"
#include "ReduceSum.h"
#include "ReduceSumSquared.h"
#include "ReduceVariance.h"
#include "ScaleAndShift.h"
#include "SetData.h"
#include "Sort.h"
#include "Split.h"
#include "Sum.h"
#include "SumByColumnTV.h"
#include "SumByColumnVT.h"
#include "TopK.h"
#include "Unsqueeze.h"
#include "XMatrixSegment.h"
#include "XTensorBLAS.h"
#include "shape/Concatenate.h"
#include "shape/ConcatenateSolely.h"
#include "movement/CopyBlocks.h"
#include "movement/CopyBlocksInGrid.h"
#include "movement/CopyBlocksOnSite.h"
#include "movement/CopyData2D.h"
#include "movement/CopyIndexed.h"
#include "movement/CopyInGrid.h"
#include "movement/CopyValues.h"
#include "utilities/FlushToMem.h"
#include "shape/MakeMergeBlockIndex.h"
#include "shape/MakeSplitBlockIndex.h"
#include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/MatrixMULBatchedCPU.h"
#include "shape/Merge.h"
#include "shape/MergeBlockLists.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/Negate.h"
#include "math/Normalize.h"
#include "shape/Permute.h"
#include "math/Power.h"
#include "reduce/ReduceMax.h"
#include "reduce/ReduceMean.h"
#include "reduce/ReduceStandardVariance.h"
#include "reduce/ReduceSum.h"
#include "reduce/ReduceSumSquared.h"
#include "reduce/ReduceVariance.h"
#include "math/ScaleAndShift.h"
#include "getandset/Select.h"
#include "getandset/SetData.h"
#include "sort/Sort.h"
#include "shape/Split.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
#include "sort/TopK.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
#include "utilities/XMatrixSegment.h"
#include "arithmetic/XTensorBLAS.h"
#endif // __CHEADER_H__
\ No newline at end of file
......@@ -219,9 +219,8 @@ public:
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
/* segmentation and parallel processing for 2d tensors (i.e., matrices) */
/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
static
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
......@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed
>> alpha - scalar
>> beta - scalar
>> c - output matrix (2d tensor)
*/
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB,
......@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
}
}
//if(isUniform){
//}
//else{
for (int i = 0; i < a->count; i++) {
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
......
......@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XDevice.h"
#include "../XName.h"
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h"
......@@ -65,13 +65,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
int cn = c->dimSize[0];
int cm = c->dimSize[1];
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
CheckNTErrors((am == bn && an == cn && bm == cm),
"Unmatched tensors in multiplication!");
......@@ -87,13 +86,13 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cBlockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
aBlockNum *= a->dimSizeRDI[i];
cBlockNum *= a->dimSizeRDI[i];
}
for (int i = 2; i < b->order; i++) {
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + a->order]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
bBlockNum *= b->dimSizeRDI[i];
cBlockNum *= b->dimSizeRDI[i];
}
......@@ -101,9 +100,9 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
int aDimSize[2] = { -a->dimSize[0], a->dimSize[1] };
int bDimSize[2] = { -b->dimSize[0], b->dimSize[1] };
int cDimSize[2] = { -c->dimSize[0], c->dimSize[1] };
int aDimSize[2] = { a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { c->dimSizeRDI[1], c->dimSizeRDI[0] };
bool isSparseMul = false;
......
......@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL_H__
#define __MATRIXMUL_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y.
*/
extern "C"
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XName.h"
#include "../../XTensor.h"
#include "../../XName.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.cuh"
#include "MatrixMul2DParallel.h"
......@@ -112,7 +112,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int num = *((int*)b->data);
char * p = (char*)b->data + sizeof(int); // pointer to the first tuple
/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
for (int i = 0; i < num; i++) {
int key = *((int*)p);
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.cuh"
#include "XTensorBLAS.h"
......@@ -37,11 +37,13 @@ c = a * b * \alpha
>> aColSize - column size of matrix a
>> aRowSize - row size of matrix a
>> b - a sparse matrix
>> transposedA - indicates whether a is transposed
>> transposedB - indicates whether b is transposed
>> bNonZeroNum - number of non-zero items in b
>> bColSize - column size of matrix b
>> bRowSize - row size of matrix b
>> c - the resulting (dense) matrix
>> cColSize - column size of matrix c
>> cRowSize - row size of matrix c
>> alpha - the scaling factor
*/
extern "C" __global__
......@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) {
CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");
//cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
......@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (beta == 0)
c->SetZeroAll();
else if (beta != 1.0F) {
//XTensor::ScaleAndShift(c, beta, 0);
ShowNTErrors("TODO!");
}
......
......@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2D_H__
#define __MATRIXMUL2D_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "MatrixMul2DMultiTheading.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DMULTITHEADING_H__
#define __MATRIXMUL2DMULTITHEADING_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,10 +19,10 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "MatrixMul2DParallel.h"
#include "MatrixMul2DMultiTheading.h"
#include "XMatrixSegment.h"
#include "../utilities/XMatrixSegment.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DPARALLEL_H__
#define __MATRIXMUL2DPARALLEL_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XDevice.h"
#include "../XName.h"
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"
......@@ -41,6 +41,7 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
......@@ -59,13 +60,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
int cn = c->dimSize[0];
int cm = c->dimSize[1];
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
CheckNTErrors((am == bn && an == cn && bm == cm),
"Unmatched tensors in multiplication!");
......@@ -87,9 +87,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
for (int p = 0; p < blockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
......@@ -114,8 +114,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
aList, transposedA,
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
alpha, beta);
......
......@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHED_H__
#define __MATRIXMULBATCHED_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,12 +19,13 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XName.h"
#include "../../XTensor.h"
#include "../../XName.h"
#include "Multiply.h"
#include "Multiply.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
......
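A host-side reference of the element-wise product defined above, for readers who want the semantics without the CUDA details (a sketch; the real kernels also handle broadcasting along a leading dimension):

/* c(i) = a(i) * b(i) + alpha * c(i), element by element */
void MultiplyRef(const float * a, const float * b, float * c,
                 int size, float alpha)
{
    for (int i = 0; i < size; i++)
        c[i] = a[i] * b[i] + alpha * c[i];
}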
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Multiply.h"
#include "Multiply.cuh"
......@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> stride - the number of items we go over when move next along the leading dimension in a block
>> ldSizeA - size of the leading dimension of a
>> ldSizeB - size of the leading dimension of b
......
......@@ -22,7 +22,7 @@
#ifndef __MULTIPLY_H__
#define __MULTIPLY_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,15 +19,15 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its minus value
>> a - the tensor we are processing
*/
void Negate(XTensor * a)
{
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"
......@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * d, int size)
......
......@@ -22,7 +22,7 @@
#ifndef __NEGATE_H__
#define __NEGATE_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XName.h"
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sum.h"
#include "Sum.cuh"
......
......@@ -19,12 +19,13 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../../XDevice.h"
#include "Sum.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of data arrays (CUDA Kernel)
c = a + b * \beta
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of data arrays (CUDA Kernel) */
extern "C" __global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
......
......@@ -22,7 +22,7 @@
#ifndef __SUM_H__
#define __SUM_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
......
......@@ -22,7 +22,7 @@
#ifndef __REDUCEMAX_CUH__
#define __REDUCEMAX_CUH__
#include "ReduceMax.h"
#include "../reduce/ReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
......
......@@ -19,14 +19,15 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of a vector (column vector) and a tensor
c = a + \sum{col} b_col * \beta
......
......@@ -22,11 +22,10 @@
#ifndef __SUMBYCOLUMNVT_H__
#define __SUMBYCOLUMNVT_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */
extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
......@@ -20,8 +20,8 @@
*/
#include "XTensorBLAS.h"
#include "../XTensor.h"
#include "../XBLAS.h"
#include "../../XTensor.h"
#include "../../XBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XUtility.h"
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XUtility.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "XTensorBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __XTENSORBLAS_H__
#define __XTENSORBLAS_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
*/
#include "../XTensor.h"
#include "../XDevice.h"
#include "../../XTensor.h"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
*/
#include "../XUtility.h"
#include "../XName.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "Select.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -33,7 +33,7 @@ c = select(a)
>> dim - the dimension along which we do the job
>> low - lower bound
>> high - higher bound.
Note that range [1,3] means that we select 1 and 2.
*/
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
{
......@@ -48,7 +48,7 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
for(int i = 0; i < a->order; i++){
if(i == dim){
CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high < a->dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
}
else{
CheckNTErrors(a->dimSize[i] == c->dimSize[i], "The size of the dimensions should be same!");
......@@ -62,20 +62,24 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
XLink::AddParamToHeadInt(c, high);
int stride = 1;
for(int i = 0; i < dim; i++)
int dimRDI = a->order - dim - 1;
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int copyTimes = 1;
for (int i = dimRDI + 1; i < a->order; i++)
copyTimes *= a->dimSizeRDI[i];
int blockSize = stride * (high - low) * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
char * s = (char*)a->data + stride * low * a->unitSize;
char * t = (char*)c->data;
for(int i = 0; i < high - low; i++){
for(int i = 0; i < copyTimes; i++){
XMemCopy(t, c->devID, s, a->devID, blockSize);
s += stepSizeS;
t += stepSizeT;
}
}
} // namespace nts(NiuTrans.Tensor)
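A worked instance of the index arithmetic above, for a 2 x 10 tensor with dim = 1 and [low, high) = [1, 3] (the numeric comments are ours):

#include <cstdio>

int main()
{
    int order = 2;
    int dimSizeRDI[2] = {10, 2};            /* dims in reverse order */
    int dim = 1, low = 1, high = 3;
    int dimRDI = order - dim - 1;           /* = 0 */

    int stride = 1;
    for (int i = 0; i < dimRDI; i++)
        stride *= dimSizeRDI[i];            /* = 1 */

    int copyTimes = 1;
    for (int i = dimRDI + 1; i < order; i++)
        copyTimes *= dimSizeRDI[i];         /* = 2: one copy per row */

    /* each copy moves stride * (high - low) items, here 2 per row */
    printf("stride=%d copyTimes=%d itemsPerCopy=%d\n",
           stride, copyTimes, stride * (high - low));
    return 0;
}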
......@@ -22,7 +22,7 @@
#ifndef __SELECT_H__
#define __SELECT_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -21,7 +21,7 @@
*/
#include "SetData.h"
#include "CopyValues.h"
#include "../movement/CopyValues.h"
#if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h"
......@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
ShowNTErrors("TODO");
}
}
/*
GPU code
The trick here is to initialize the data on a temporary tensor on the CPU.
The CPU data is then copied to the GPU.
TODO: generate data points on GPUs straightforwardly.
*/
else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
......
......@@ -23,7 +23,7 @@
#ifndef __SETDATA_H__
#define __SETDATA_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -20,11 +20,12 @@
*/
#include <math.h>
#include "../XTensor.h"
#include "../../XTensor.h"
#include "Normalize.h"
#include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
......
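For reference, the per-element formula in plain C++ (a sketch of the math only; the real routine works along a chosen dimension with tensors for mean and variance):

#include <cmath>

/* y = a * (x - mean) / sqrt(variance + epsilon) + b */
float NormalizeRef(float x, float mean, float variance,
                   float a, float b, float epsilon)
{
    return a * (x - mean) / std::sqrt(variance + epsilon) + b;
}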
......@@ -19,12 +19,13 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Normalize.h"
#include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
normalize the data with normal distribution (kernel code). For an input x,
......
......@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
normalize the data with normal distribution (Kernel code). For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
......@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum);
/*
normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
......
......@@ -22,7 +22,7 @@
#ifndef __NORMALIZE_H__
#define __NORMALIZE_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -20,15 +20,16 @@
*/
#include <math.h>
#include "../XTensor.h"
#include "../../XTensor.h"
#include "Power.h"
#include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the power(a, p)
>> a - the tensor
>> p - the exponent
*/
void Power(XTensor * a, DTYPE p)
{
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Power.h"
#include "Power.cuh"
......@@ -87,9 +87,6 @@ __global__
void KernelPower(__half * d, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//int i = blockDim.x * blockIdx.x + threadIdx.x;
//if (i < size)
// d[i] = hpow(d[i], p);
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
......@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
}
else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!");
//unsigned short p2 = FloatToFloat16(p);
//__half * pp = (__half*)&p2;
//KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
}
}
else {
......
......@@ -22,7 +22,7 @@
#ifndef __POWER_H__
#define __POWER_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entries
p = p * scale + shift
>> a - the tensor
>> scale - the scale factor
>> shift - the shift factor
......
......@@ -21,7 +21,7 @@
#include "ScaleAndShift.h"
#include "ScaleAndShift.cuh"
#include "../XDevice.h"
#include "../../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
/*
scale and shift all matrix entries
p = p * scale + shift
>> a - the tensor
>> scale - the scale factor
>> shift - the shift factor
......
......@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_CUH__
#define __SCALEANDSHIFT_CUH__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_H__
#define __SCALEANDSHIFT_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XUtility.h"
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "CopyBlocks.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksSelected.cuh"
......@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
else {
int devID = myMem != NULL ? myMem->devID : -1;
/*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0; i < blockNum; i++) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
......
......@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_H__
#define __COPYBLOCKS_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "CopyBlocksInGrid.h"
#include "../XUtility.h"
#include "../../XUtility.h"
#include "CopyBlocksInGrid.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -21,7 +21,7 @@
#include "CopyBlocksInGrid.h"
#include "CopyBlocksInGrid.cuh"
#include "../XDevice.h"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_CUH__
#define __COPYBLOCKSINGRID_CUH__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_H__
#define __COPYBLOCKSINGRID_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,12 +19,13 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XUtility.h"
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy a number of blocks to target positions. Here we assume that
all the data has been on the device (CPU/GPU) already.
......@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
else {
int devID = myMem != NULL ? myMem->devID : -1;
/*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize);
......
......@@ -21,7 +21,7 @@
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
#include "../XDevice.h"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_CUH__
#define __COPYBLOCKS_CUH__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSONSITE_H__
#define __COPYBLOCKSONSITE_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -21,8 +21,8 @@
#include "CopyBlocks.h"
#include "CopyBlocksSelected.cuh"
#include "../XUtility.h"
#include "../XDevice.h"
#include "../../XUtility.h"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSSELECTED_CUH__
#define __COPYBLOCKSSELECTED_CUH__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "CopyData2D.h"
#include "../XUtility.h"
#include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __COPYDATA2D_H__
#define __COPYDATA2D_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "CopyInGrid.h"
#include "CopyBlocksInGrid.h"
......@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
in the k-th grid
>> blockDim - leading dimension of blocks
>> blockNumInGrid - number of blocks in each grid
>> isOnDev - indicates whether the index is on the device already
>> isIndexOnDev - indicates whether the index is on the device already
*/
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
......
......@@ -22,7 +22,7 @@
#ifndef __COPYINGRID_H__
#define __COPYINGRID_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -36,6 +36,7 @@ copy indexed sub-tensors
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g.,
for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
<< return - whether the copy was successful
*/
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
......
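The copyNum convention in the comment above expands each source index into a run of consecutive sub-tensors; a tiny sketch of the expansion, using the values from the comment:

#include <cstdio>

int main()
{
    int srcIndex[2] = {1, 4};
    int indexSize = 2, copyNum = 2;
    /* expands to sub-tensors 1, 2, 4, 5 */
    for (int i = 0; i < indexSize; i++)
        for (int j = 0; j < copyNum; j++)
            printf("copy sub-tensor %d\n", srcIndex[i] + j);
    return 0;
}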
......@@ -22,7 +22,7 @@
#ifndef __COPYINDEXED_H__
#define __COPYINDEXED_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XName.h"
#include "../../XName.h"
#include "CopyValues.h"
#include "CopyValues.cuh"
......
......@@ -21,8 +21,8 @@
#include "CopyValues.h"
#include "CopyValues.cuh"
#include "../XUtility.h"
#include "../XDevice.h"
#include "../../XUtility.h"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,13 +22,12 @@
#ifndef __COPYVALUES_CUH__
#define __COPYVALUES_CUH__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/**************************************/
/* copy all elements from a source matrix to a target matrix */
extern "C"
bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
......
......@@ -22,7 +22,7 @@
#ifndef __COPYVALUES_H__
#define __COPYVALUES_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../XName.h"
#include "../../XTensor.h"
#include "../../XName.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
......
......@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XTensor.h"
#include "../XUtility.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
......@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
sum_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
}
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
sum_i = max_{0<=j<strideNum} input_{i,j}
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
*/
__global__
void KernelReduceMax(__half * input, __half * output,
                     int stride, int strideNum, int reducedStrideNum,
                     int blockSize, int blockNum)
{
    int idx = threadIdx.x * blockDim.y + threadIdx.y;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i >= stride * blockNum)
        return;

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
#else
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
#endif

    __syncthreads();

    int k = i / stride;
    int iOffset = i % stride;

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    __half value = (i < stride * blockNum && j < strideNum) ?
                   input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN);
#else
    DTYPE value = (i < stride * blockNum && j < strideNum) ?
                  __half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
#endif

    /* load data into the shared mem */
    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;

    __syncthreads();

    /* do reduction in shared mem */
    for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
        if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
            iData[idx] = iData[idx + s];
        }
        __syncthreads();
    }

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    /* write result for this block to the output array */
    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
#else
    /* write result for this block to the output array */
    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
#endif
}
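A host-side reference of what the kernel computes, with the block viewed as a strideNum x stride matrix (a sketch; the GPU version additionally tiles the work across thread blocks and shared memory):

/* output_i = max over 0 <= j < strideNum of input_{i,j} */
void ReduceMaxRef(const float * input, float * output,
                  int stride, int strideNum)
{
    for (int i = 0; i < stride; i++) {
        float m = input[i];
        for (int j = 1; j < strideNum; j++) {
            float v = input[j * stride + i];
            if (v > m)
                m = v;
        }
        output[i] = m;
    }
}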
/*
reduce a tensor to another that keeps the max value along a dimension - fast version
>> input - the input array (representing a tensor)
......@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
>> output - the output tensor
>> dim - which dimension to reduce
......
......@@ -22,7 +22,7 @@
#ifndef __REDUCEMAX_H__
#define __REDUCEMAX_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "ScaleAndShift.h"
#include "../math/ScaleAndShift.h"
#include "ReduceSum.h"
#include "ReduceMean.h"
......@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * sum_i input_i
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
......@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
ScaleAndShift(output, (DTYPE)1/num, 0);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -22,7 +22,7 @@
#ifndef __REDUCEMEAN_H__
#define __REDUCEMEAN_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#ifndef __REDUCESTANDARDVARIANCE_H__
#define __REDUCESTANDARDVARIANCE_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,7 +22,7 @@
#include <math.h>
#include "ReduceSum.h"
#include "ReduceSum.cuh"
#include "../XName.h"
#include "../../XName.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XDevice.h"
#include "../XUtility.h"
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "ReduceSum.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
reduce a tensor to another that keeps the sum along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
= sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
/*
reduce a tensor to another that keeps the sum along a dimension - slow version
This is for float16 reduction.
Given a block of data, we go over each dimension i in the stride and we have
sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
= sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
where we can view the block as a matrix and input_{i,j} represents the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
/*
reduce a tensor to another that keeps the sum along a dimension - fast version
This is for float16 reduction
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
......
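Similarly, a host-side sketch of the sum reduction described above (whether the shift applies inside the plain sum exactly as written is our reading of the comment):

#include <cmath>

/* output_i = sum_j exp(input_{i,j} - shift)  if isExp
            = sum_j (input_{i,j} - shift)     otherwise */
void ReduceSumRef(const float * input, float * output,
                  int stride, int strideNum, float shift, bool isExp)
{
    for (int i = 0; i < stride; i++) {
        float sum = 0;
        for (int j = 0; j < strideNum; j++) {
            float v = input[j * stride + i] - shift;
            sum += isExp ? std::exp(v) : v;
        }
        output[i] = sum;
    }
}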
......@@ -22,7 +22,7 @@
#ifndef __REDUCESUM_H__
#define __REDUCESUM_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
......
......@@ -22,7 +22,7 @@
#ifndef __REDUCESUMSQUARED_H__
#define __REDUCESUMSQUARED_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "ScaleAndShift.h"
#include "../math/ScaleAndShift.h"
#include "ReduceSum.h"
#include "ReduceVariance.h"
......@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
variance of the items along a dimension of the tensor.
For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
......
......@@ -22,7 +22,7 @@
#ifndef __REDUCEVARIANCE_H__
#define __REDUCEVARIANCE_H__
#include "../XTensor.h"
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XTensor.h"
#include "../../XTensor.h"
#include "Concatenate.h"
#include "Merge.h"
#include "ConcatenateSolely.h"
......@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
/*
concatenate two tensors along a given dimension
>> smallA - one tensor for concatenation
>> smallB - the other tensor for concatenation
>> big - the resulting tensor
>> dim - the dimension along which we perform the concatenation
*/
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
{
......
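A quick sketch of the shape rule the two-tensor overload implies: along dim the sizes add up, and every other dimension must agree (our own illustration, not library code):

#include <cassert>

/* big->dimSize[dim] = smallA->dimSize[dim] + smallB->dimSize[dim];
   all other dimensions must match */
void ConcatShape(const int * a, const int * b, int * big,
                 int order, int dim)
{
    for (int i = 0; i < order; i++) {
        if (i == dim)
            big[i] = a[i] + b[i];
        else {
            assert(a[i] == b[i]);
            big[i] = a[i];
        }
    }
}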