Commit abeb3e64 by liyinqiao

merged

parents dcabc2b0 414ff54f
@@ -38,7 +38,7 @@
#include "XMem.h"
#include "XHeap.h"
#include "XBLAS.h"
-#include "core/MergeBlockLists.h"
+#include "core/shape/MergeBlockLists.h"

#ifdef USE_CUDA

@@ -47,8 +47,8 @@
#include <cublas_v2.h>
#include <cuda.h>
#include <curand.h>
-#include "core/FlushToMem.cuh"
-#include "core/SetAscendingOrder.cuh"
+#include "core/utilities/FlushToMem.cuh"
+#include "core/utilities/SetAscendingOrder.cuh"
#endif

@@ -555,6 +555,27 @@ bool XTensor::CheckData(const void * d, int num, int beg)
    return true;
}
+bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
+{
+    if (data == NULL || d == NULL)
+        return false;
+    CheckNTErrors(!isSparse, "TODO");
+    CheckNTErrors(num == unitNum - beg, "Illegal size!");
+    DTYPE * valuePrt = (DTYPE*)data;
+    DTYPE value = 0;
+    DTYPE * answerPrt = (DTYPE*)d;
+    for (int i = beg; i < num; i++) {
+        value = ToCPU(devID, valuePrt);
+        if (fabs(value - *answerPrt) > tolerance)
+            return false;
+        valuePrt++;
+        answerPrt++;
+    }
+    return true;
+}
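For reference, a minimal call sketch of the new overload (the tensor t and the expected values are illustrative, not from the commit; the signature matches the declaration added to XTensor.h below):

    float answer[4] = {1.0F, 2.0F, 3.0F, 4.0F};
    bool ok = t->CheckData(answer, 4, 1e-4F);   /* beg defaults to 0 */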
/*
set the cell to the ascending order along a given dimension
>> dim - the dimension specified

@@ -696,6 +717,63 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2)
    return ToCPU(devID, value);
}
+/*
+get the value of a cell in a 1d tensor in int type
+>> i - index
+<< return - value of cell(i) in int
+*/
+int XTensor::Get1DInt(int i)
+{
+    CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dimSize[1] = {i};
+    void * value = GetCell(dimSize, 1);
+    return ToCPUInt(devID, value);
+}
+
+/*
+get the value of a cell in a 2d tensor in int type
+>> ni - row index
+>> mi - column index
+<< return - value of cell(ni, mi) in int
+*/
+int XTensor::Get2DInt(int ni, int mi)
+{
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dims[2] = {ni, mi};
+    void * value = GetCell(dims, 2);
+    return ToCPUInt(devID, value);
+}
+
+/*
+get the value of a cell in a 3d tensor in int type
+>> d0 - index of dimension 0
+>> d1 - index of dimension 1
+>> d2 - index of dimension 2
+<< return - value of cell(d0, d1, d2) in int
+*/
+int XTensor::Get3DInt(int d0, int d1, int d2)
+{
+    CheckNTErrors((order == 3), "Cannot get a 3d cell for a tensor whose order is not 3!");
+    CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((d1 >= 0 && d1 < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dims[3] = {d0, d1, d2};
+    void * value = GetCell(dims, 3);
+    return ToCPUInt(devID, value);
+}
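A usage sketch of the three accessors (the tensors are illustrative; each call assumes an X_INT tensor of the matching order):

    int a = vec->Get1DInt(2);        /* cell (2) of a 1d int tensor */
    int b = mat->Get2DInt(0, 4);     /* cell (0, 4) of a 2d int tensor */
    int c = ten->Get3DInt(1, 0, 3);  /* cell (1, 0, 3) of a 3d int tensor */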
/*
get the value of a cell in the sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
......
@@ -211,6 +211,9 @@ struct XTensor
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);

+/* check whether the data array is the same as the answer, within a tolerance */
+bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
+
/* set the cell to the ascending order along a given dimension */
void SetAscendingOrder(int dim);

@@ -220,15 +223,24 @@ struct XTensor
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1);

-/* get the value of a cell in a 1d tensor */
+/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i);

-/* get the value of a cell in a 2d tensor */
+/* get the default type value of a cell in a 2d tensor */
DTYPE Get2D(int ni, int mi);

-/* get the value of a cell in a 3d tensor */
+/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);

+/* get the int value of a cell in a 1d tensor */
+int Get1DInt(int i);
+
+/* get the int value of a cell in a 2d tensor */
+int Get2DInt(int ni, int mi);
+
+/* get the int value of a cell in a 3d tensor */
+int Get3DInt(int d0, int d1, int d2);
+
/* get the value of a cell in a sparse tensor */
DTYPE GetInSparse(int i);
......
@@ -26,43 +26,49 @@
#include "../XTensor.h"
-#include "Concatenate.h"
-#include "ConcatenateSolely.h"
-#include "CopyIndexed.h"
-#include "CopyInGrid.h"
-#include "CopyValues.h"
-#include "FlushToMem.h"
-#include "MakeMergeBlockIndex.h"
-#include "MakeSplitBlockIndex.h"
-#include "MatrixMul.h"
-#include "MatrixMul2D.h"
-#include "MatrixMul2DMultiTheading.h"
-#include "MatrixMul2DParallel.h"
-#include "MatrixMulBatched.h"
-#include "MatrixMULBatchedCPU.h"
-#include "Merge.h"
-#include "MergeBlockLists.h"
-#include "Multiply.h"
-#include "Negate.h"
-#include "Normalize.h"
-#include "Permute.h"
-#include "Power.h"
-#include "ReduceMax.h"
-#include "ReduceMean.h"
-#include "ReduceStandardVariance.h"
-#include "ReduceSum.h"
-#include "ReduceSumSquared.h"
-#include "ReduceVariance.h"
-#include "ScaleAndShift.h"
-#include "SetData.h"
-#include "Sort.h"
-#include "Split.h"
-#include "Sum.h"
-#include "SumByColumnTV.h"
-#include "SumByColumnVT.h"
-#include "TopK.h"
-#include "Unsqueeze.h"
-#include "XMatrixSegment.h"
-#include "XTensorBLAS.h"
+#include "shape/Concatenate.h"
+#include "shape/ConcatenateSolely.h"
+#include "movement/CopyBlocks.h"
+#include "movement/CopyBlocksInGrid.h"
+#include "movement/CopyBlocksOnSite.h"
+#include "movement/CopyData2D.h"
+#include "movement/CopyIndexed.h"
+#include "movement/CopyInGrid.h"
+#include "movement/CopyValues.h"
+#include "utilities/FlushToMem.h"
+#include "shape/MakeMergeBlockIndex.h"
+#include "shape/MakeSplitBlockIndex.h"
+#include "arithmetic/MatrixMul.h"
+#include "arithmetic/MatrixMul2D.h"
+#include "arithmetic/MatrixMul2DMultiTheading.h"
+#include "arithmetic/MatrixMul2DParallel.h"
+#include "arithmetic/MatrixMulBatched.h"
+#include "arithmetic/MatrixMULBatchedCPU.h"
+#include "shape/Merge.h"
+#include "shape/MergeBlockLists.h"
+#include "arithmetic/Multiply.h"
+#include "arithmetic/Negate.h"
+#include "math/Normalize.h"
+#include "shape/Permute.h"
+#include "math/Power.h"
+#include "reduce/ReduceMax.h"
+#include "reduce/ReduceMean.h"
+#include "reduce/ReduceStandardVariance.h"
+#include "reduce/ReduceSum.h"
+#include "reduce/ReduceSumSquared.h"
+#include "reduce/ReduceVariance.h"
+#include "math/ScaleAndShift.h"
+#include "getandset/Select.h"
+#include "getandset/SetData.h"
+#include "sort/Sort.h"
+#include "shape/Split.h"
+#include "arithmetic/Sum.h"
+#include "arithmetic/SumByColumnTV.h"
+#include "arithmetic/SumByColumnVT.h"
+#include "sort/TopK.h"
+#include "shape/Transpose.h"
+#include "shape/Unsqueeze.h"
+#include "utilities/XMatrixSegment.h"
+#include "arithmetic/XTensorBLAS.h"

#endif // __CHEADER_H__
\ No newline at end of file
@@ -219,9 +219,8 @@ public:
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);

-/*******************************************************************
-segmentation and parallel processing for 2d tensors (i.e., matrices)
-*/
+/* segmentation and parallel processing for 2d tensors (i.e., matrices) */

/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
static
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"

@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed
->> c - output matrix (2d tensor)
>> alpha - scalar
>> beta - scalar
+>> c - output matrix (2d tensor)
*/
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
                         XList * b, MATRIX_TRANS_TYPE transposedB,

@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
    }
}

-//if(isUniform){
-//}
-//else{
for (int i = 0; i < a->count; i++) {
    XTensor * ai = (XTensor*)a->GetItem(i);
    XTensor * bi = (XTensor*)b->GetItem(i);
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h"

@@ -65,13 +65,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);

-int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-int cn = c->dimSize[0];
-int cm = c->dimSize[1];
+int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+int cn = c->dimSizeRDI[1];
+int cm = c->dimSizeRDI[0];

CheckNTErrors((am == bn && an == cn && bm == cm),
              "Unmatched tensors in multiplication!");

@@ -87,13 +86,13 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cBlockNum = 1;

for (int i = 2; i < a->order; i++) {
-    CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+    CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
    aBlockNum *= a->dimSizeRDI[i];
    cBlockNum *= a->dimSizeRDI[i];
}

for (int i = 2; i < b->order; i++) {
-    CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + a->order]), "Incorrect tensor sizes!");
+    CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
    bBlockNum *= b->dimSizeRDI[i];
    cBlockNum *= b->dimSizeRDI[i];
}

@@ -101,9 +100,9 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
-int aDimSize[2] = { -a->dimSize[0], a->dimSize[1] };
-int bDimSize[2] = { -b->dimSize[0], b->dimSize[1] };
-int cDimSize[2] = { -c->dimSize[0], c->dimSize[1] };
+int aDimSize[2] = { a->dimSizeRDI[1], a->dimSizeRDI[0] };
+int bDimSize[2] = { b->dimSizeRDI[1], b->dimSizeRDI[0] };
+int cDimSize[2] = { c->dimSizeRDI[1], c->dimSizeRDI[0] };

bool isSparseMul = false;
......
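The repeated dimSize-to-dimSizeRDI edits in this file rely on one convention: dimSizeRDI stores the shape in reverse dimension index (RDI) order, so dimSizeRDI[0] is the last (column) dimension and dimSizeRDI[1] the row dimension of the trailing matrix. A standalone sketch of the relation (the construction is an illustration, not code from the commit):

    #include <cassert>

    int main()
    {
        const int order = 3;
        int dimSize[3] = {4, 2, 3};          /* {block, row, column} */
        int dimSizeRDI[3];
        for (int i = 0; i < order; i++)      /* reverse the dimension order */
            dimSizeRDI[i] = dimSize[order - i - 1];
        assert(dimSizeRDI[0] == 3);          /* columns: the last dimension */
        assert(dimSizeRDI[1] == 2);          /* rows: the second-to-last */
        assert(dimSizeRDI[2] == 4);          /* leading (block) dimension */
        return 0;
    }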
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL_H__
#define __MATRIXMUL_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y.
*/
extern "C"
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
               DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

} // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.cuh"
#include "MatrixMul2DParallel.h"

@@ -112,7 +112,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int num = *((int*)b->data);
char * p = (char*)b->data + sizeof(int); // pointer to the first tuple

/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
    for (int i = 0; i < num; i++) {
        int key = *((int*)p);
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.cuh"
#include "XTensorBLAS.h"

@@ -37,11 +37,13 @@ c = a * b * \alpha
>> aColSize - column size of matrix a
>> aRowSize - row size of matrix a
>> b - a sparse matrix
->> transposedA - indicates whether b is transposed
+>> transposedB - indicates whether b is transposed
>> bNonZeroNum - number of non-zero items in b
>> bColSize - column size of matrix b
>> bRowSize - row size of matrix b
>> c - the resulting (dense) matrix
+>> cColSize - column size of matrix c
+>> cRowSize - row size of matrix c
>> alpha - the scaling factor
*/
extern "C" __global__

@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) {
    CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");

-    //cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
    cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();

    /* !!!! might have problems */

@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (beta == 0)
    c->SetZeroAll();
else if (beta != 1.0F) {
-    //XTensor::ScaleAndShift(c, beta, 0);
    ShowNTErrors("TODO!");
}
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2D_H__
#define __MATRIXMUL2D_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "MatrixMul2DMultiTheading.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DMULTITHEADING_H__
#define __MATRIXMUL2DMULTITHEADING_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,10 +19,10 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "MatrixMul2DParallel.h"
#include "MatrixMul2DMultiTheading.h"
-#include "XMatrixSegment.h"
+#include "../utilities/XMatrixSegment.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DPARALLEL_H__
#define __MATRIXMUL2DPARALLEL_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"

@@ -41,6 +41,7 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
+>> parallelRunner - parallel processing module
*/
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
                      XTensor * b, MATRIX_TRANS_TYPE transposedB,

@@ -59,13 +60,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);

-int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-int cn = c->dimSize[0];
-int cm = c->dimSize[1];
+int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+int cn = c->dimSizeRDI[1];
+int cm = c->dimSizeRDI[0];

CheckNTErrors((am == bn && an == cn && bm == cm),
              "Unmatched tensors in multiplication!");

@@ -87,9 +87,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
-int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
-int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
-int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
+int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
+int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
+int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

for (int p = 0; p < blockNum; p++) {
    void * ap = (char*)a->data + aRealBlockSize * p;

@@ -114,8 +114,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);

-CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
-                      aList, transposedA,
+cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+CudaBLASMatrixMULList(handle,
+                      aList, transposedA,
                       bList, transposedB,
                       cList, aList->count,
                       alpha, beta);
......
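A hedged usage sketch of the batched multiplication, with shapes that satisfy the dimension checks above. NewTensor follows the call shape seen later in this commit; leaving its trailing arguments and MatrixMulBatched's alpha, beta and parallelRunner at their defaults is an assumption, and all variable names are illustrative:

    int aDims[3] = {8, 2, 3};   /* 8 blocks of 2 x 3 */
    int bDims[3] = {8, 3, 4};   /* 8 blocks of 3 x 4 */
    int cDims[3] = {8, 2, 4};   /* 8 blocks of 2 x 4 */
    XTensor * a = NewTensor(3, aDims);
    XTensor * b = NewTensor(3, bDims);
    XTensor * c = NewTensor(3, cDims);
    MatrixMulBatched(a, X_NOTRANS, b, X_NOTRANS, c);   /* c_i = a_i * b_i */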
@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHED_H__
#define __MATRIXMULBATCHED_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
#include "Multiply.h"
#include "Multiply.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Multiply.h"
#include "Multiply.cuh"

@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
>> a - tensor a
>> b - tensor b
>> c - result tensor
+>> alpha - the coefficient
>> stride - the number of items we go over when move next along the leading dimension in a block
>> ldSizeA - size of the leading dimension of a
>> ldSizeB - size of the leading dimension of b
......
@@ -22,7 +22,7 @@
#ifndef __MULTIPLY_H__
#define __MULTIPLY_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,15 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
set every entry to its minus value
>> a - the tensor we are processing
*/
void Negate(XTensor * a)
{
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"

@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
}

/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * d, int size)
......
@@ -22,7 +22,7 @@
#ifndef __NEGATE_H__
#define __NEGATE_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
#include "Sum.h"
#include "Sum.cuh"
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
+#include "../../XDevice.h"
#include "Sum.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
summation of data arrays (CUDA Kernel)
c = a + b * \beta
......
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA

/* summation of data arrays (CUDA Kernel) */
extern "C" __global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
......
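For context, a sketch of how a kernel with the signature above is typically launched; the grid sizing and the device pointers a, b and c are illustrative, not part of the commit:

    int threads = 128;
    int blocks = (size + threads - 1) / threads;         /* one thread per element */
    KernelADD<<<blocks, threads>>>(a, b, c, size, beta); /* computes c = a + b * beta */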
@@ -22,7 +22,7 @@
#ifndef __SUM_H__
#define __SUM_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
......
@@ -22,7 +22,7 @@
#ifndef __REDUCEMAX_CUH__
#define __REDUCEMAX_CUH__

-#include "ReduceMax.h"
+#include "../reduce/ReduceMax.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
......
@@ -19,14 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
summation of a vector (column vector) and a tensor
c = a + \sum{col} b_col * \beta
......
@@ -22,11 +22,10 @@
#ifndef __SUMBYCOLUMNVT_H__
#define __SUMBYCOLUMNVT_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

/* sum of a (column) vector and a tensor */
extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
@@ -20,8 +20,8 @@
 */

#include "XTensorBLAS.h"
-#include "../XTensor.h"
-#include "../XBLAS.h"
+#include "../../XTensor.h"
+#include "../../XBLAS.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XUtility.h"
-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XUtility.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "XTensorBLAS.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __XTENSORBLAS_H__
#define __XTENSORBLAS_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
 */

-#include "../XUtility.h"
-#include "../XName.h"
+#include "../../XUtility.h"
+#include "../../XName.h"
#include "Select.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -33,7 +33,7 @@ c = select(a)
>> dim - the dimension along which we do the job
>> low - lower bound
>> high - higher bound.
   Note that range [1,3] means that we select 1 and 2.
*/
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
{

@@ -48,7 +48,7 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
for(int i = 0; i < a->order; i++){
    if(i == dim){
        CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
-        CheckNTErrors(high > 0 && high < a->dimSize[dim], "Illegal range specified!");
+        CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
    }
    else{
        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "The size of the dimensions should be same!");

@@ -62,20 +62,24 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
XLink::AddParamToHeadInt(c, high);

int stride = 1;
-for(int i = 0; i < dim; i++)
+int dimRDI = a->order - dim - 1;
+for(int i = 0; i < dimRDI; i++)
    stride *= a->dimSizeRDI[i];

+int copyTimes = 1;
+for (int i = dimRDI + 1; i < a->order; i++)
+    copyTimes *= a->dimSizeRDI[i];
+
int blockSize = stride * (high - low) * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
char * s = (char*)a->data + stride * low * a->unitSize;
char * t = (char*)c->data;
-for(int i = 0; i < high - low; i++){
+for(int i = 0; i < copyTimes; i++){
    XMemCopy(t, c->devID, s, a->devID, blockSize);
    s += stepSizeS;
    t += stepSizeT;
}
}

} // namespace nts(NiuTrans.Tensor)
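A worked example of the fixed index arithmetic, under hypothetical shapes that mirror the loops above:

    #include <cassert>

    int main()
    {
        /* a->dimSize = {2, 3, 4}, so a->dimSizeRDI = {4, 3, 2}; select dim 1 */
        int order = 3;
        int dimSizeRDI[3] = {4, 3, 2};
        int dim = 1;

        int dimRDI = order - dim - 1;        /* = 1: dim's position in RDI order */
        int stride = 1;
        for (int i = 0; i < dimRDI; i++)
            stride *= dimSizeRDI[i];         /* items below the selected dim */
        int copyTimes = 1;
        for (int i = dimRDI + 1; i < order; i++)
            copyTimes *= dimSizeRDI[i];      /* blocks above the selected dim */

        assert(stride == 4);
        assert(copyTimes == 2);
        return 0;
    }

The old loop ran high - low times regardless of the outer dimensions; with copyTimes the copy is repeated once per outer block, which is what the shape requires.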
@@ -22,7 +22,7 @@
#ifndef __SELECT_H__
#define __SELECT_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)
......
@@ -21,7 +21,7 @@
 */

#include "SetData.h"
-#include "CopyValues.h"
+#include "../movement/CopyValues.h"

#if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h"

@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
        ShowNTErrors("TODO");
    }
}

-/* GPU code
-The trick here is to initialize the data on a temporary tensor on the CPU.
-The CPU data is then copied to the GPU.
-TODO: generate data points on GPUs straightforwardly.
+/*
+GPU code
+The trick here is to initialize the data on a temporary tensor on the CPU.
+The CPU data is then copied to the GPU.
+TODO: generate data points on GPUs straightforwardly.
*/
else{
    XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
......
@@ -23,7 +23,7 @@
#ifndef __SETDATA_H__
#define __SETDATA_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -20,11 +20,12 @@
 */

#include <math.h>
-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "Normalize.h"
#include "Normalize.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Normalize.h"
#include "Normalize.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
normalize the data with normal distribution (kernel code). For an input x,
......
@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA

-/* normalize the data with normal distribution (Kernel code). For an input x,
+/*
+normalize the data with normal distribution (Kernel code). For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/

@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
                     DTYPE * a, DTYPE * b, DTYPE epsilon,
                     int stride, int strideNum, int blockNum);

-/* normalize the data with normal distribution. For an input x,
+/*
+normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
......
@@ -22,7 +22,7 @@
#ifndef __NORMALIZE_H__
#define __NORMALIZE_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -20,15 +20,16 @@
 */

#include <math.h>
-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "Power.h"
#include "Power.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
get the power(a, p)
>> a - the tensor
->> power - as it is
+>> p - as it is
*/
void Power(XTensor * a, DTYPE p)
{
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Power.h"
#include "Power.cuh"

@@ -87,9 +87,6 @@ __global__
void KernelPower(__half * d, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-    //int i = blockDim.x * blockIdx.x + threadIdx.x;
-    //if (i < size)
-    //    d[i] = hpow(d[i], p);
#else
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)

@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
    }
    else if (p != (DTYPE)1.0) {
        ShowNTErrors("TODO!");
-        //unsigned short p2 = FloatToFloat16(p);
-        //__half * pp = (__half*)&p2;
-        //KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
    }
}
else {
......
@@ -22,7 +22,7 @@
#ifndef __POWER_H__
#define __POWER_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entries
p = p * scale + shift
>> a - the tensor
>> scale - the scaling factor
>> shift - the shift factor
......
@@ -21,7 +21,7 @@
#include "ScaleAndShift.h"
#include "ScaleAndShift.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
/*
scale and shift all matrix entries
p = p * scale + shift
>> a - the tensor
>> scale - the scaling factor
>> shift - the shift factor
......
@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_CUH__
#define __SCALEANDSHIFT_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_H__
#define __SCALEANDSHIFT_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XUtility.h"
+#include "../../XTensor.h"
+#include "../../XUtility.h"
#include "CopyBlocks.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksSelected.cuh"

@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
else {
    int devID = myMem != NULL ? myMem->devID : -1;

-    /* The following code should be fine with GPUs, but too many
-    kernel calls would slow down the system. We prefer to use
-    one kernel to do block copy in batch (kernel fusion). */
+    /*
+    The following code should be fine with GPUs, but too many
+    kernel calls would slow down the system. We prefer to use
+    one kernel to do block copy in batch (kernel fusion).
+    */
    for (int i = 0; i < blockNum; i++) {
        XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                 (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_H__
#define __COPYBLOCKS_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "CopyBlocksInGrid.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
#include "CopyBlocksInGrid.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -21,7 +21,7 @@
#include "CopyBlocksInGrid.h"
#include "CopyBlocksInGrid.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_CUH__
#define __COPYBLOCKSINGRID_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_H__
#define __COPYBLOCKSINGRID_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XUtility.h"
+#include "../../XTensor.h"
+#include "../../XUtility.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
copy a number of blocks to target positions. Here we assume that
all the data has been on the device (CPU/GPU) already.

@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
else {
    int devID = myMem != NULL ? myMem->devID : -1;

-    /* The following code should be fine with GPUs, but too many
-    kernel calls would slow down the system. We prefer to use
-    one kernel to do block copy in batch (kernel fusion). */
+    /*
+    The following code should be fine with GPUs, but too many
+    kernel calls would slow down the system. We prefer to use
+    one kernel to do block copy in batch (kernel fusion).
+    */
    for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
        XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                 (char*)source + b, devID, blockSize);
......
@@ -21,7 +21,7 @@
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_CUH__
#define __COPYBLOCKS_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSONSITE_H__
#define __COPYBLOCKSONSITE_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -21,8 +21,8 @@
#include "CopyBlocks.h"
#include "CopyBlocksSelected.cuh"
-#include "../XUtility.h"
-#include "../XDevice.h"
+#include "../../XUtility.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSSELECTED_CUH__
#define __COPYBLOCKSSELECTED_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "CopyData2D.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYDATA2D_H__
#define __COPYDATA2D_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "CopyInGrid.h"
#include "CopyBlocksInGrid.h"

@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
   in the k-th grid
>> blockDim - leading dimension of blocks
>> blockNumInGrid - number of blocks in each grid
->> isOnDev - indicates whether the index is on the device already
+>> isIndexOnDev - indicates whether the index is on the device already
*/
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
......
@@ -22,7 +22,7 @@
#ifndef __COPYINGRID_H__
#define __COPYINGRID_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -36,6 +36,7 @@ copy indexed sub-tensors
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g.,
   for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
+<< return - whether the copy operation succeeded
*/
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
......
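A hedged call sketch matching the example in the comment above (s, t and the index arrays are illustrative): with srcIndex = {1, 4} and copyNum = 2, sub-tensors 1, 2, 4 and 5 of s are copied into positions 0, 1, 2 and 3 of t along dimension 0:

    int srcIndex[2] = {1, 4};
    int tgtIndex[2] = {0, 2};
    bool ok = CopyIndexed(s, t, 0, srcIndex, 2, tgtIndex, 2);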
@@ -22,7 +22,7 @@
#ifndef __COPYINDEXED_H__
#define __COPYINDEXED_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XName.h"
+#include "../../XName.h"
#include "CopyValues.h"
#include "CopyValues.cuh"
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include "CopyValues.h" #include "CopyValues.h"
#include "CopyValues.cuh" #include "CopyValues.cuh"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,13 +22,12 @@ ...@@ -22,13 +22,12 @@
#ifndef __COPYVALUES_CUH__ #ifndef __COPYVALUES_CUH__
#define __COPYVALUES_CUH__ #define __COPYVALUES_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/**************************************/
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C" extern "C"
bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL); bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYVALUES_H__ #ifndef __COPYVALUES_H__
#define __COPYVALUES_H__ #define __COPYVALUES_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XName.h" #include "../../XName.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 reduce a tensor to another that keeps the max value along a dimension - slow version
-
 Given a block of data, we go over each dimension i in the stride and we have
-
 output_i = max_{0<=j<strideNum} input_{i,j}
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the max over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
 }
 /*
 reduce a tensor to another that keeps the max value along a dimension - slow version
-
 Given a block of data, we go over each dimension i in the stride and we have
-
 output_i = max_{0<=j<strideNum} input_{i,j}
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the max over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
 >> strideNum - how many strides we need to finish the reduction
 >> reducedStrideNum - the number of strides after reduction
 >> blockSize - size of the block (i.e., stride * strideNum)
 >> blockNum - how many blocks
 */
 __global__
 void KernelReduceMax(__half * input, __half * output,
                      int stride, int strideNum, int reducedStrideNum,
                      int blockSize, int blockNum)
 {
     int idx = threadIdx.x * blockDim.y + threadIdx.y;
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
     if (i >= stride * blockNum)
         return;
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
 #else
     __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
 #endif
     __syncthreads();
     int k = i / stride;
     int iOffset = i % stride;
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     __half value = (i < stride * blockNum && j < strideNum) ?
                    input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN);
 #else
     DTYPE value = (i < stride * blockNum && j < strideNum) ?
                   __half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
 #endif
     /* load data into the shared mem */
     iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
     __syncthreads();
     /* do reduction in shared mem */
     for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
         if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
             iData[idx] = iData[idx + s];
         }
         __syncthreads();
     }
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     /* write result for this block to the output array */
     if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
         output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
 #else
     /* write result for this block to the output array */
     if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
         output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
 #endif
 }
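The two kernels above implement the same contract at different precisions. As a reading aid, here is a host-side reference loop for that contract, a sketch only (plain C++ with float standing in for DTYPE, not the library's code path): for each block k and each in-stride offset i, take the max over the strideNum items spaced stride apart.

#include <algorithm>
#include <cfloat>
#include <vector>

/* reference semantics: output_{k,i} = max_{0<=j<strideNum} input[k * blockSize + j * stride + i] */
void ReduceMaxReference(const std::vector<float> & input, std::vector<float> & output,
                        int stride, int strideNum, int blockNum)
{
    int blockSize = stride * strideNum;
    output.assign((size_t)stride * blockNum, -FLT_MAX);
    for (int k = 0; k < blockNum; k++) {
        for (int i = 0; i < stride; i++) {
            float m = -FLT_MAX;
            for (int j = 0; j < strideNum; j++)
                m = std::max(m, input[(size_t)k * blockSize + (size_t)j * stride + i]);
            output[(size_t)k * stride + i] = m;
        }
    }
}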
 /*
 reduce a tensor to another that keeps the max value along a dimension - fast version
 >> input - the input array (representing a tensor)
...
@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
 /*
 get the max-valued items along a dimension of the tensor (cuda version).
 For a 1-dimensional data array a,
-
 output_i = max_{0<=j<strideNum} input_{i,j}
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - which dimension to reduce
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMAX_H__
 #define __REDUCEMAX_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "ScaleAndShift.h"
+#include "../math/ScaleAndShift.h"
 #include "ReduceSum.h"
 #include "ReduceMean.h"
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
 mean = (1/n) * sum_i input_i
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension along which the reduction is performed
...
@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
 ScaleAndShift(output, (DTYPE)1/num, 0);
 }
-
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
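The body above shows the whole trick: ReduceMean is a sum reduction followed by ScaleAndShift with scale = 1/n and shift = 0. A one-dimensional sketch of the same two-step composition (plain C++, not the tensor version):

#include <vector>

/* mean = (1/n) * sum_i a_i, written as the two steps ReduceMean chains */
float Mean1D(const std::vector<float> & a)
{
    if (a.empty())
        return 0.0f;
    float sum = 0.0f;
    for (float v : a)
        sum += v;                                     /* the ReduceSum step */
    return sum * (1.0f / (float)a.size()) + 0.0f;     /* ScaleAndShift(output, 1/n, 0) */
}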
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMEAN_H__
 #define __REDUCEMEAN_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCESTANDARDVARIANCE_H__
 #define __REDUCESTANDARDVARIANCE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -22,7 +22,7 @@
 #include <math.h>
 #include "ReduceSum.h"
 #include "ReduceSum.cuh"
-#include "../XName.h"
+#include "../../XName.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "ReduceSum.cuh"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 reduce a tensor to another that keeps the sum along a dimension - slow version
 Given a block of data, we go over each dimension i in the stride and we have
-
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
       = sum_{0<=j<strideNum} (input_{i,j} - shift)    if isExp == false;
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
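The shift/isExp pair is what lets this one kernel serve numerically stable softmax-style sums: with shift typically set to the row maximum and isExp == true, it produces sum_j exp(x_j - max). A per-row reference of the two modes, a sketch only (plain C++, shift applied per element in both branches, mirroring the formulas above):

#include <cmath>
#include <vector>

/* one row of the reduction: sum_j exp(x_j - shift) if isExp,
   otherwise sum_j (x_j - shift) */
float ShiftedRowSum(const std::vector<float> & x, float shift, bool isExp)
{
    float sum = 0.0f;
    for (float v : x)
        sum += isExp ? std::exp(v - shift) : (v - shift);
    return sum;
}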
@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
 /*
 reduce a tensor to another that keeps the sum along a dimension - slow version
 This is for float16 reduction.
 Given a block of data, we go over each dimension i in the stride and we have
-
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
       = sum_{0<=j<strideNum} (input_{i,j} - shift)    if isExp == false;
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
...
@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
 /*
 reduce a tensor to another that keeps the sum along a dimension - fast version
 This is for float16 reduction
-
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCESUM_H__
 #define __REDUCESUM_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 squared sum of the items along a dimension of the tensor.
 For a 1-dimensional data array a,
 sum = \sum_i (a_i - shift)^2
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension along which the reduction is performed
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCESUMSQUARED_H__
 #define __REDUCESUMSQUARED_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "ScaleAndShift.h"
+#include "../math/ScaleAndShift.h"
 #include "ReduceSum.h"
 #include "ReduceVariance.h"
@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 variance of the items along a dimension of the tensor.
 For a 1-dimensional data array a,
 variance = 1/n * \sum_i (a_i - mean)^2
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension along which the reduction is performed
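Given the includes above (ScaleAndShift plus the sum reductions), ReduceVariance presumably chains a mean, a mean-shifted squared sum, and a 1/n scale; here is a one-dimensional sketch under that assumption (plain C++, not the library's exact call sequence):

#include <vector>

/* variance = (1/n) * sum_i (a_i - mean)^2 */
float Variance1D(const std::vector<float> & a)
{
    if (a.empty())
        return 0.0f;
    float mean = 0.0f;
    for (float v : a)
        mean += v;
    mean /= (float)a.size();                 /* the mean step */
    float sq = 0.0f;
    for (float v : a)
        sq += (v - mean) * (v - mean);       /* squared sum with shift = mean */
    return sq / (float)a.size();             /* the final 1/n scale */
}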
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCEVARIANCE_H__
 #define __REDUCEVARIANCE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Concatenate.h"
 #include "Merge.h"
 #include "ConcatenateSolely.h"
@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
 /*
 concatenate two tensors along a given dimension
+>> smallA - one tensor for concatenation
+>> smallB - the other tensor for concatenation
+>> big - the resulting tensor
+>> dim - the dimension along which we perform the concatenation
 */
 void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
 {
...
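The two-tensor overload documented here most plausibly just forwards to the XList-based Concatenate named in the hunk header; a sketch under that assumption (the XList usage is illustrative, not confirmed by this diff):

/* hypothetical forwarding body: collect the pair into a list and
   reuse the list-based overload */
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
{
    XList smalls(2);
    smalls.Add(smallA);
    smalls.Add(smallB);
    Concatenate(&smalls, big, dim);
}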