Commit 2837e09f by xuchen

merge with xu

parents 087efa88 394e8340
...@@ -53,8 +53,8 @@ int main( int argc, const char ** argv ) ...@@ -53,8 +53,8 @@ int main( int argc, const char ** argv )
if(argc > 1 && !strcmp(argv[1], "-test")) if(argc > 1 && !strcmp(argv[1], "-test"))
Test(); Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm")) //else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1); // FNNLMMain(argc - 1, argv + 1);
else{ else{
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n"); fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n"); fprintf(stderr, "use of tensors. All you need is to ... \n\n");
......
...@@ -82,7 +82,7 @@ _XINLINE_ float Float16ToFloat(unsigned short h) ...@@ -82,7 +82,7 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
} }
/* /*
data conversion data type conversion
>> devID - device id >> devID - device id
>> s - source data array >> s - source data array
>> typeS - source data type >> typeS - source data type
...@@ -92,7 +92,7 @@ data conversion ...@@ -92,7 +92,7 @@ data conversion
*/ */
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size) void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{ {
CheckNTErrors((devID < 0), "This code must be run on GPUs!"); CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT) if(typeS == typeT)
return; return;
......
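For reference, a minimal calling sketch of ConvertDataType as declared above. It assumes X_FLOAT and X_FLOAT16 are members of TENSOR_DATA_TYPE and that half-precision values are stored as unsigned short, as the Float16ToFloat helper suggests; devID = -1 selects the CPU path that the check above requires.
/* hypothetical caller, not part of this commit */
void ConvertFloatToFloat16OnCPU()
{
    const int size = 8;
    float src[size] = { 0.5F, -1.0F, 2.0F, 3.5F, -4.25F, 0.0F, 1.0F, -0.125F };
    unsigned short dst[size];    /* assumed float16 storage, see Float16ToFloat above */
    /* devID < 0 means CPU, which is what the CheckNTErrors above demands */
    ConvertDataType(-1, src, X_FLOAT, dst, X_FLOAT16, size);
}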
...@@ -229,6 +229,7 @@ void XLink::AddParam(void * param, int size) ...@@ -229,6 +229,7 @@ void XLink::AddParam(void * param, int size)
paramNum++; paramNum++;
delete[] (char*)ps; delete[] (char*)ps;
} }
/*
create a hyperedge with two input tensors and an output tensor
>> t1 - a tail tensor
...@@ -254,7 +255,7 @@ create a hyper edge with a list of tensors and a output tensor ...@@ -254,7 +255,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor >> h - head tensor
>> id - id of the edge type >> id - id of the edge type
*/ */
void XLink::MakeLink(XList * list, XTensor * h, int id) void XLink::MakeLink(const XList * list, XTensor * h, int id)
{ {
/* forward */ /* forward */
XLink &income = h->income; XLink &income = h->income;
...@@ -307,6 +308,43 @@ void XLink::AddParamToHeadInt(XTensor * h, int param) ...@@ -307,6 +308,43 @@ void XLink::AddParamToHeadInt(XTensor * h, int param)
} }
/*
add a MATRIX_TRANS_TYPE parameter
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(MATRIX_TRANS_TYPE));
}
/*
add a boolean parameter
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadBool(XTensor * h, bool param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(bool));
}
/*
add a pointer parameter
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadPointer(XTensor * h, void * param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(param));
}
/*
replace a node with another, i.e., we redirect the links to the new node replace a node with another, i.e., we redirect the links to the new node
>> oldOne - the node to be replaced >> oldOne - the node to be replaced
>> newOne - the new node >> newOne - the new node
......
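To show how the new AddParamToHead* helpers fit together with MakeLink, here is a sketch of the recording pattern that MatrixMul and MatrixMulBatched use later in this commit; only the wrapper function itself is illustrative.
/* illustrative wrapper; the calls inside mirror MatrixMul in this commit */
void RecordMatrixMulNode(const XTensor &a, const XTensor &b, XTensor &c,
                         MATRIX_TRANS_TYPE transposedA, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha, DTYPE beta)
{
    /* a and b become tails of the incoming hyperedge of c */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
    /* operation parameters are appended to the head's incoming edge */
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);
    XLink::AddParamToHead(&c, beta);
}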
...@@ -127,7 +127,7 @@ struct XLink ...@@ -127,7 +127,7 @@ struct XLink
/* create a hyper edge with a list of input tensors and a output tensor */ /* create a hyper edge with a list of input tensors and a output tensor */
static static
void MakeLink(XList * list, XTensor * h, int id); void MakeLink(const XList * list, XTensor * h, int id);
/* add a parameter */ /* add a parameter */
static static
...@@ -137,6 +137,18 @@ struct XLink ...@@ -137,6 +137,18 @@ struct XLink
static static
void AddParamToHeadInt(XTensor * h, int param); void AddParamToHeadInt(XTensor * h, int param);
/* add a MATRIX_TRANS_TYPE parameter */
static
void AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param);
/* add a boolean parameter */
static
void AddParamToHeadBool(XTensor * h, bool param);
/* add a pointer parameter */
static
void AddParamToHeadPointer(XTensor * h, void * param);
/* replace a node with another, i.e., we redirect the links to the new node */ /* replace a node with another, i.e., we redirect the links to the new node */
static static
void Replace(const XTensor * oldOne, XTensor * newOne); void Replace(const XTensor * oldOne, XTensor * newOne);
......
...@@ -206,7 +206,7 @@ void XList::Insert(int pos, void * item) ...@@ -206,7 +206,7 @@ void XList::Insert(int pos, void * item)
} }
/* get the item at position i */ /* get the item at position i */
void * XList::GetItem(int i) void * XList::GetItem(int i) const
{ {
if( i >= 0 && i < count ) if( i >= 0 && i < count )
return items[i]; return items[i];
......
...@@ -74,7 +74,7 @@ public: ...@@ -74,7 +74,7 @@ public:
void AddList(XList * l); void AddList(XList * l);
void AddInt(int i); void AddInt(int i);
void Insert(int pos, void * item); void Insert(int pos, void * item);
void * GetItem(int i); void * GetItem(int i) const;
int GetItemInt(int i); int GetItemInt(int i);
void SetItem(int i, void * item); void SetItem(int i, void * item);
void SetItemInt(int i, int item); void SetItemInt(int i, int item);
......
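A small sketch of why GetItem is made const here: read-only helpers can now walk a const XList, the same way the batched matrix-multiplication loops elsewhere in this commit do.
/* illustrative helper; assumes every item in the list is a 2d XTensor */
DTYPE SumOfFirstCells(const XList * tensors)
{
    DTYPE total = 0;
    for (int i = 0; i < tensors->count; i++) {
        XTensor * t = (XTensor*)tensors->GetItem(i);   /* now legal on a const XList */
        total += t->Get2D(0, 0);                       /* Get2D is also const in this commit */
    }
    return total;
}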
...@@ -27,12 +27,56 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,12 +27,56 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
const char * GetOPName(int type) const char * GetOPName(int type)
{ {
if((type & MATH_ARITHMETIC) != 0){ if((type & MATH_ARITHMETIC) != 0){
if(type == MATH_SUM) if(type == MATH_ABSOLUTE)
return "M_SUM"; return "M_ABSOLUTE";
else if(type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if(type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if(type == MATH_MULTIPLY) else if(type == MATH_MULTIPLY)
return "M_MULTIPLY"; return "M_MULTIPLY";
else if(type == MATH_NEGATE)
return "M_NEGATE";
else if(type == MATH_SIGN)
return "M_SIGN";
else if(type == MATH_SUM)
return "M_SUM";
else if(type == MATH_LOG)
return "M_LOG";
else if(type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if(type == MATH_POWER)
return "M_POWER";
else if(type == MATH_SCALEANDSHIFT) else if(type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT"; return "M_SCALEANDSHIFT";
else if(type == GETANDSET_SELECT)
return "G_SELECT";
else if(type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if(type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if(type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if(type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if(type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if(type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if(type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
else if(type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if(type == SHAPE_MERGE)
return "S_MERGE";
else if(type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if(type == SHAPE_SPLIT)
return "S_SPLIT";
else if(type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if(type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
} }
return "NULL"; return "NULL";
......
...@@ -29,9 +29,40 @@ ...@@ -29,9 +29,40 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 0x00001000 #define MATH_ARITHMETIC 0x00001000
#define MATH_SUM MATH_ARITHMETIC + 1 #define MATH_ABSOLUTE MATH_ARITHMETIC + 1
#define MATH_MULTIPLY MATH_SUM + 1 #define MATH_MATRIXMUL MATH_ABSOLUTE + 1
#define MATH_SCALEANDSHIFT MATH_MULTIPLY + 1 #define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_SIGN MATH_NEGATE + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_LOG MATH_SUM + 1
#define MATH_NORMALIZE MATH_LOG + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define GETANDSET MATH_SCALEANDSHIFT + 1
#define GETANDSET_SELECT GETANDSET + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define REDUCE MOVEMENT_COPYVALUES + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
#define SHAPE REDUCE_REDUCEVARIANCE + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_PERMUTE SHAPE_MERGE + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
/* get operator name */ /* get operator name */
const char * GetOPName(int type); const char * GetOPName(int type);
......
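Note that every code above is MATH_ARITHMETIC (0x00001000) plus a small consecutive offset, which is why GetOPName() can gate all of its branches on (type & MATH_ARITHMETIC). A minimal sketch of the mapping:
/* illustrative only; GetOPName and the operator codes are declared above */
#include <stdio.h>
void PrintSomeOpNames()
{
    printf("%s\n", GetOPName(MATH_MATRIXMUL));     /* prints "M_MATRIXMUL" */
    printf("%s\n", GetOPName(REDUCE_REDUCESUM));   /* prints "R_REDUCESUM" */
    printf("%s\n", GetOPName(SHAPE_UNSQUEEZE));    /* prints "S_UNSQUEEZE" */
    printf("%s\n", GetOPName(0));                  /* unknown code: "NULL" */
}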
...@@ -173,7 +173,7 @@ XTensor::XTensor(const XTensor &reference) ...@@ -173,7 +173,7 @@ XTensor::XTensor(const XTensor &reference)
devID = reference.devID; devID = reference.devID;
mem = reference.mem; mem = reference.mem;
InitTensor(this, &reference); InitTensor(this, &reference);
CopyValues(&reference, this); _CopyValues(&reference, this);
} }
if(reference.isTmp) if(reference.isTmp)
...@@ -300,7 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor) ...@@ -300,7 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
} }
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio); Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
CopyValues(&tensor, this); _CopyValues(&tensor, this);
} }
/* copy member variables */ /* copy member variables */
...@@ -345,7 +345,7 @@ judge whether the two matrices are in the same type and size ...@@ -345,7 +345,7 @@ judge whether the two matrices are in the same type and size
>> b - another tensor to compare with
<< return - whether the two input tensors are identical << return - whether the two input tensors are identical
*/ */
bool XTensor::IsIdentical(XTensor * a, XTensor * b) bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
{ {
if(a->order != b->order) if(a->order != b->order)
return false; return false;
...@@ -427,7 +427,7 @@ void XTensor::Reshape(const int myOrder, const int * myDimSize) ...@@ -427,7 +427,7 @@ void XTensor::Reshape(const int myOrder, const int * myDimSize)
} }
/* get the number of items in the data array */ /* get the number of items in the data array */
int XTensor::GetSize() int XTensor::GetSize() const
{ {
if(isSparse) if(isSparse)
return unitNumNonZero; return unitNumNonZero;
...@@ -743,7 +743,7 @@ get the pointer to a cell ...@@ -743,7 +743,7 @@ get the pointer to a cell
>> size - size of index >> size - size of index
<< return - pointer to the cell << return - pointer to the cell
*/ */
void * XTensor::GetCell(int index[], int size) void * XTensor::GetCell(int index[], int size) const
{ {
CheckNTErrors((size == order), "Illegal index!"); CheckNTErrors((size == order), "Illegal index!");
...@@ -795,7 +795,7 @@ get the value of a cell in a 2d tensor in default type ...@@ -795,7 +795,7 @@ get the value of a cell in a 2d tensor in default type
>> mi - column index >> mi - column index
<< return - value of cell(ni, mi) in float << return - value of cell(ni, mi) in float
*/ */
DTYPE XTensor::Get2D(int ni, int mi) DTYPE XTensor::Get2D(int ni, int mi) const
{ {
CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!"); CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!"); CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
...@@ -1243,7 +1243,7 @@ binary search to find an element in a sparse tensor ...@@ -1243,7 +1243,7 @@ binary search to find an element in a sparse tensor
it is the previous one if there is no hit it is the previous one if there is no hit
<< return - find it or not? << return - find it or not?
*/ */
bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
{ {
CheckNTErrors((isSparse), "A sparse tensor is required!"); CheckNTErrors((isSparse), "A sparse tensor is required!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type."); CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type.");
......
...@@ -201,7 +201,7 @@ public: ...@@ -201,7 +201,7 @@ public:
/* judge whether the two matrices are in the same type and size */ /* judge whether the two matrices are in the same type and size */
static static
bool IsIdentical(XTensor * a, XTensor * b); bool IsIdentical(const XTensor * a, const XTensor * b);
/* judge whether the three matrices are in the same type and size */ /* judge whether the three matrices are in the same type and size */
static static
...@@ -217,7 +217,7 @@ public: ...@@ -217,7 +217,7 @@ public:
void Reshape(const int order, const int * myDimSize); void Reshape(const int order, const int * myDimSize);
/* get the number of items in the data array */ /* get the number of items in the data array */
int GetSize(); int GetSize() const;
/* get size of the memory used */ /* get size of the memory used */
int GetDataSizeInChar(); int GetDataSizeInChar();
...@@ -253,13 +253,13 @@ public: ...@@ -253,13 +253,13 @@ public:
DTYPE Get(int index[], int size = -1); DTYPE Get(int index[], int size = -1);
/* get the pointer to a cell */ /* get the pointer to a cell */
void * GetCell(int index[], int size = -1); void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 1d tensor */ /* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i); DTYPE Get1D(int i);
/* get the default type value of a cell in a 2d tensor */ /* get the default type value of a cell in a 2d tensor */
DTYPE Get2D(int ni, int mi); DTYPE Get2D(int ni, int mi) const;
/* get the default type value of a cell in a 3d tensor */ /* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2); DTYPE Get3D(int d0, int d1, int d2);
...@@ -314,7 +314,7 @@ public: ...@@ -314,7 +314,7 @@ public:
bool Resize(const XTensor * myTensor); bool Resize(const XTensor * myTensor);
/* binary search to find an element in a sparse matrix*/ /* binary search to find an element in a sparse matrix*/
bool BinarySearch(int key, DTYPE &value, void * &position); bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */ /* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0); void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
......
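The const qualifiers added above (IsIdentical, GetSize, GetCell, Get2D, BinarySearch) let read-only code accept const tensors. A minimal sketch:
/* illustrative helper built only on the const members shown above */
bool SameShapeAndNonEmpty(const XTensor * a, const XTensor * b)
{
    if (!XTensor::IsIdentical(a, b))    /* now accepts const XTensor*        */
        return false;
    return a->GetSize() > 0;            /* GetSize() is const in this commit */
}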
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/ */
#include <math.h>
#include "../../XTensor.h" #include "../../XTensor.h"
#include "Absolute.h" #include "Absolute.h"
#include "Absolute.cuh" #include "Absolute.cuh"
...@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its absolute value set every entry to its absolute value
>> a - the tensor we are processing >> a - the tensor we are processing
*/ */
void Absolute(XTensor * a) void _Absolute(XTensor * a)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaAbsolute(a); _CudaAbsolute(a);
return; return;
} }
#endif #endif
......
...@@ -58,7 +58,7 @@ set each entry to its with float16 data type value ...@@ -58,7 +58,7 @@ set each entry to its with float16 data type value
>> a - the tensor >> a - the tensor
*/ */
extern "C" extern "C"
void CudaAbsolute(XTensor * a) void _CudaAbsolute(XTensor * a)
{ {
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
......
...@@ -35,7 +35,7 @@ void KernelAbsolute(__half * d, int size); ...@@ -35,7 +35,7 @@ void KernelAbsolute(__half * d, int size);
/* set each entry to its absolute value */ /* set each entry to its absolute value */
extern "C" extern "C"
void CudaAbsolute(XTensor * a); void _CudaAbsolute(XTensor * a);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */ /* set every entry to its absolute value */
extern "C" extern "C"
void Absolute(XTensor * a); void _Absolute(XTensor * a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -37,8 +37,8 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1] ...@@ -37,8 +37,8 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar >> alpha - scalar
>> beta - scalar >> beta - scalar
*/ */
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta) XList * c, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input lists!"); CheckNTErrors((a && b && c), "Empty input lists!");
...@@ -73,11 +73,11 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, ...@@ -73,11 +73,11 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!"); CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS #ifdef USE_BLAS
if (useBLAS) if (useBLAS)
MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta); _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else else
MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta); _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else #else
MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta); _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif #endif
} }
//} //}
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */ /* matrix multiplication in batch mode (CPU code) */
extern "C" extern "C"
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -30,34 +30,34 @@ ...@@ -30,34 +30,34 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication

For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor of B (of size z * x),
like this: c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
XPRunner * parallelRunner)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType), CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!"); "Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2), CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order > 2!"); "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
...@@ -132,7 +132,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -132,7 +132,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * ai = (XTensor*)aList->GetItem(i); XTensor * ai = (XTensor*)aList->GetItem(i);
XTensor * bi = (XTensor*)bList->GetItem(i); XTensor * bi = (XTensor*)bList->GetItem(i);
XTensor * ci = (XTensor*)cList->GetItem(i); XTensor * ci = (XTensor*)cList->GetItem(i);
MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner); _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner);
} }
} }
else if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) { else if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
...@@ -144,7 +144,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -144,7 +144,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID); cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
CudaBLASMatrixMULList(handle, _CudaBLASMatrixMULList(handle,
aList, transposedA, aList, transposedA,
bList, transposedB, bList, transposedB,
cList, aList->count, cList, aList->count,
...@@ -157,7 +157,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -157,7 +157,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
} }
else { else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
MatrixMULBatchedCPU(aList, transposedA, _MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB, bList, transposedB,
cList, alpha, beta); cList, alpha, beta);
} }
...@@ -184,4 +184,74 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -184,4 +184,74 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete bList; delete bList;
delete cList; delete cList;
} }
/*
matrix multiplication (return a XTensor structure)
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((&a && &b), "Empty input tensors!");
CheckNTErrors((a.dataType == b.dataType), "Input tensors should have the same data type!");
CheckNTErrors((a.order >= 2 && b.order >= 2), "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
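A usage sketch of the new value-returning MatrixMul for the plain 2d case described in the comment; x and w are assumed to be initialized elsewhere, and alpha, beta and parallelRunner keep their defaults from the header.
/* illustrative caller of the value-returning MatrixMul */
XTensor Linear(const XTensor &x, const XTensor &w)
{
    /* x: batch x in, w: in x out  ->  result: batch x out (plain matrix product) */
    return MatrixMul(x, X_NOTRANS, w, X_NOTRANS);
}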
...@@ -27,18 +27,34 @@ ...@@ -27,18 +27,34 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication

For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor of B (of size z * x),
like this: c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return a XTensor structure)
make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor of B (of size z * x),
like this: c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/ */
extern "C" XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -30,8 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,8 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication (for 2d tensors) matrix multiplication (for 2d tensors)
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
>> a - tensor a >> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed >> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b >> b - tensor b
...@@ -42,8 +44,8 @@ where trans() return the transposed matrix if the flag is fired ...@@ -42,8 +44,8 @@ where trans() return the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline >> stream - the string for creating the job pipeline
*/ */
void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream) XPRunner * parallelRunner, XStream * stream)
{ {
...@@ -67,7 +69,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -67,7 +69,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA #ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream); _CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
return; return;
} }
#endif #endif
...@@ -81,9 +83,9 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -81,9 +83,9 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
c->dataType == DEFAULT_DTYPE) c->dataType == DEFAULT_DTYPE)
{ {
if (useBLAS) if (useBLAS)
MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta); _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else else
MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner); _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
} }
else { else {
// TODO!! // TODO!!
......
...@@ -108,8 +108,10 @@ void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, in ...@@ -108,8 +108,10 @@ void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, in
/* /*
matrix multiplication (for 2d tensors) (cuda version) matrix multiplication (for 2d tensors) (cuda version)
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
>> a - tensor a >> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed >> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b >> b - tensor b
...@@ -119,8 +121,8 @@ where trans() return the transposed matrix if the flag is fired ...@@ -119,8 +121,8 @@ where trans() return the transposed matrix if the flag is fired
>> beta - another coefficient >> beta - another coefficient
>> stream - the string for creating the job pipeline >> stream - the string for creating the job pipeline
*/ */
void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, XTensor * c,
DTYPE alpha, DTYPE beta, XStream * stream) DTYPE alpha, DTYPE beta, XStream * stream)
{ {
...@@ -156,7 +158,7 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,7 +158,7 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1], a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1],
alpha, beta); alpha, beta);
} }
......
...@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta ...@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
extern "C" extern "C"
void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -31,8 +31,7 @@ matrix multiplication (for 2d tensors) ...@@ -31,8 +31,7 @@ matrix multiplication (for 2d tensors)
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
extern "C" void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -38,7 +38,7 @@ argument5: matrix a ...@@ -38,7 +38,7 @@ argument5: matrix a
argument6: matrix b argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta) argument7: matrix c (c=a*b*\alpha + c*beta)
*/ */
void MatrixMul2DMultiTheading(XList * args) void _MatrixMul2DMultiTheading(XList * args)
{ {
int x1 = *(int*)args->GetItem(0); int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1); int y1 = *(int*)args->GetItem(1);
......
...@@ -31,7 +31,7 @@ matrix multiplication for a block (x1,y1) - (x2,y2) ...@@ -31,7 +31,7 @@ matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/ */
extern "C" extern "C"
void MatrixMul2DMultiTheading(XList * args); void _MatrixMul2DMultiTheading(XList * args);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -30,6 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,6 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication (for 2d tensors) with multi-threading matrix multiplication (for 2d tensors) with multi-threading
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
>> a - tensor a >> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed >> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b >> b - tensor b
...@@ -39,10 +40,9 @@ where trans() return the transposed matrix if the flag is fired ...@@ -39,10 +40,9 @@ where trans() return the transposed matrix if the flag is fired
>> beta - another coefficient >> beta - another coefficient
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
*/ */
void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
XPRunner * parallelRunner)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
...@@ -56,7 +56,7 @@ void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -56,7 +56,7 @@ void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* a * b */ /* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) { if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
RunParallel2D(parallelRunner, (void*)MatrixMul2DMultiTheading, an * am * bm, RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
cn, cm, 5, cn, cm, 5,
a, b, c, &alpha, &beta); a, b, c, &alpha, &beta);
} }
......
...@@ -27,12 +27,12 @@ ...@@ -27,12 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication (for 2d tensors) with multi-threading matrix multiplication (for 2d tensors) with multi-threading.
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired.
*/ */
extern "C" extern "C"
void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
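For completeness, a sketch of calling the renamed CPU fallback directly; a, b and c must be order-2 tensors of the default data type, as the checks in the .cpp file require.
/* illustrative caller of the multi-threaded CPU path */
void MulOnCPU(const XTensor * a, const XTensor * b, XTensor * c, XPRunner * runner)
{
    _MatrixMul2DParallel(a, X_NOTRANS, b, X_NOTRANS, c, (DTYPE)1.0, 0, runner);
}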
...@@ -30,10 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,10 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication of the two tensors matrix multiplication of the two tensors
for each 2-dimensional data array in a (denoted as ai) and for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
>> a - tensor a >> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed >> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b >> b - tensor b
...@@ -43,8 +45,8 @@ where trans() returns the transposed matrix if the flag is fired ...@@ -43,8 +45,8 @@ where trans() returns the transposed matrix if the flag is fired
>> beta - another coefficient >> beta - another coefficient
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
*/ */
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner) XPRunner * parallelRunner)
{ {
...@@ -52,7 +54,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -52,7 +54,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType), CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!"); "Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2), CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order > 2!"); "Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
...@@ -109,7 +113,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -109,7 +113,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID); cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
CudaBLASMatrixMULList(handle, _CudaBLASMatrixMULList(handle,
aList, transposedA, aList, transposedA,
bList, transposedB, bList, transposedB,
cList, aList->count, cList, aList->count,
...@@ -122,7 +126,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -122,7 +126,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
} }
else { else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
MatrixMULBatchedCPU(aList, transposedA, _MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB, bList, transposedB,
cList, alpha, beta); cList, alpha, beta);
} }
...@@ -150,4 +154,65 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -150,4 +154,65 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList; delete cList;
} }
/*
matrix multiplication of the two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((&a && &b), "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors((a.order >= 2 && b.order >= 2), "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
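A sketch contrasting the pointer form with the new value-returning form; a and b are assumed to be compatible stacks of matrices laid out according to the library's dimSizeRDI convention, and cOut is assumed to be pre-sized and zeroed.
/* illustrative caller; with beta = 0 both results hold the same values */
XTensor TwoWaysToMultiplyBatched(const XTensor &a, const XTensor &b, XTensor &cOut)
{
    /* low-level form: write into an existing output tensor */
    _MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &cOut);
    /* high-level form: allocate the result and register the op via XLink */
    return MatrixMulBatched(a, X_NOTRANS, b, X_NOTRANS);
}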
...@@ -28,13 +28,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,13 +28,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication of the two tensors matrix multiplication of the two tensors
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication of the two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
*/ */
extern "C" XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -28,14 +28,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,14 +28,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
element-wise product of two tensors element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item where i is the index of the item
>> a - matrix a >> a - matrix a
>> b - matrix b >> b - matrix b
>> c - result matrix >> c - result matrix
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
*/ */
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim) void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{ {
...@@ -121,9 +122,12 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i ...@@ -121,9 +122,12 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
} }
/* /*
element-wise product of two tensors and keep the result in the input element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i) a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the item where i is the index of the item
>> a - tensor a (where keep the result) >> a - tensor a (where keep the result)
>> b - tensor b >> b - tensor b
>> alpha - the coefficient >> alpha - the coefficient
...@@ -135,9 +139,12 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -135,9 +139,12 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
} }
/* /*
make a tensor of the element-wise product for two input tensors: element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item where i is the index of the item
>> a - tensor a >> a - tensor a
>> b - tensor b >> b - tensor b
>> alpha - the coefficient >> alpha - the coefficient
...@@ -151,7 +158,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -151,7 +158,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
XTensor c(&a); XTensor c(&a);
c.SetTMP(); c.SetTMP();
/* computation */ /* call _Multiply function */
_Multiply(&a, &b, &c, alpha, leadingDim); _Multiply(&a, &b, &c, alpha, leadingDim);
/* tensor connections */ /* tensor connections */
......
...@@ -26,19 +26,27 @@ ...@@ -26,19 +26,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);

/*
element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);

/*
element-wise product of two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
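A sketch of the three entry points declared above; x, y and out are assumed to share the same shape, and alpha and leadingDim keep their default values.
/* illustrative caller of the three Multiply variants */
XTensor MultiplyVariants(XTensor &x, const XTensor &y, XTensor &out)
{
    _Multiply(&x, &y, &out);   /* out(i) = x(i) * y(i)               */
    _MultiplyMe(&x, &y);       /* x(i)   = x(i) * y(i), in place     */
    return Multiply(x, y);     /* new tensor plus XLink bookkeeping  */
}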
...@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its minus value set every entry to its minus value
>> a - the tensor we are processing >> a - the tensor we are processing
*/ */
void Negate(XTensor * a) void _Negate(XTensor * a)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaNegate(a); _CudaNegate(a);
return; return;
} }
#endif #endif
......
...@@ -66,7 +66,7 @@ set each entry to its negtive value ...@@ -66,7 +66,7 @@ set each entry to its negtive value
>> a - the tensor >> a - the tensor
*/ */
extern "C" extern "C"
void CudaNegate(XTensor * a) void _CudaNegate(XTensor * a)
{ {
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h" #include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -35,8 +38,10 @@ void KernelNegate(__half * d, int size); ...@@ -35,8 +38,10 @@ void KernelNegate(__half * d, int size);
/* set each entry to its negative value */
extern "C" extern "C"
void CudaNegate(XTensor * a); void _CudaNegate(XTensor * a);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_CUH__
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */ /* set every entry to its minus value */
extern "C" extern "C"
void Negate(XTensor * a); void _Negate(XTensor * a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its sign value set every entry to its sign value
>> a - the tensor we are processing >> a - the tensor we are processing
*/ */
void Sign(XTensor * a) void _Sign(XTensor * a)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaSign(a); _CudaSign(a);
return; return;
} }
#endif #endif
......
...@@ -64,7 +64,7 @@ set each entry to its with float16 data type value ...@@ -64,7 +64,7 @@ set each entry to its with float16 data type value
>> a - the tensor >> a - the tensor
*/ */
extern "C" extern "C"
void CudaSign(XTensor * a) void _CudaSign(XTensor * a)
{ {
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/ */
#ifndef __SIGN_CUH__
#define __SIGN_CUH__
#include "Sign.h" #include "Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -35,8 +38,10 @@ void KernelSign(__half * d, int size); ...@@ -35,8 +38,10 @@ void KernelSign(__half * d, int size);
/* set each entry to its sign value */ /* set each entry to its sign value */
extern "C" extern "C"
void CudaSign(XTensor * a); void _CudaSign(XTensor * a);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_CUH__
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */ /* set every entry to its sign value */
extern "C" extern "C"
void Sign(XTensor * a); void _Sign(XTensor * a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
tensor summation c = a + b * \beta tensor summation c = a + b * \beta
return a pointer
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> c - where we put a+b*\beta. we save it in a if c is NULL >> c - where we put a+b*\beta. we save it in a if c is NULL
...@@ -112,8 +112,9 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -112,8 +112,9 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
} }
/* /*
tensor summation a = a + b * \beta tensor summation a = a + b * \beta (do it on site)
do it on site keep the result in the tensor a and return nothing
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
...@@ -124,18 +125,20 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta) ...@@ -124,18 +125,20 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
} }
/* /*
tensor summation a = a + b * \beta tensor summation c = a + b * \beta (return a XTensor structure)
return a XTensor structure make a new tensor c to keep the result and return it
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
<< return - the result of tensor summation
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMP(); c.SetTMP();
/* computation */ /* call _Sum function */
_Sum(&a, &b, &c, beta); _Sum(&a, &b, &c, beta);
/* tensor connections */ /* tensor connections */
......
...@@ -29,10 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,10 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor summation a = a + b * \beta (return a pointer) */ /*
tensor summation a = a + b * \beta
keep the result in the input tensor a and return nothing
*/
void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0); void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (return a structure) */ /*
tensor summation c = a + b * \beta
make a new tensor c to keep the result and return it
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
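For reference, a minimal usage sketch of the three summation variants declared above (illustration only, not part of this commit; the tensors a and b are assumed to be initialized elsewhere with the same shape):
    XTensor c(&a);                    /* result buffer for the pointer-style call */
    _Sum(&a, &b, &c, 0.5F);           /* c = a + b * 0.5 */
    _SumMe(&a, &b);                   /* a = a + b, beta defaults to 1.0 */
    XTensor d = Sum(a, b, 0.5F);      /* new temporary tensor d = a + b * 0.5, with tensor connections */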
...@@ -37,11 +37,8 @@ where b is a vector. ...@@ -37,11 +37,8 @@ where b is a vector.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
...@@ -56,7 +53,7 @@ void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) ...@@ -56,7 +53,7 @@ void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaSumByColumnTV(a, b, c, beta); _CudaSumByColumnTV(a, b, c, beta);
#endif #endif
} }
else { else {
......
...@@ -64,11 +64,8 @@ where b is a vector. ...@@ -64,11 +64,8 @@ where b is a vector.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* summation of a tensor and a vector (column vector) */ /* summation of a tensor and a vector (column vector) */
extern "C" extern "C"
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a tensor and a (column) vector */ /* sum of a tensor and a (column) vector */
extern "C" extern "C"
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
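A hedged usage sketch of _SumByColumnTV (illustration only): reading the checks above, a and c are assumed here to be n * m tensors and b an n * 1 column vector, so b is added to every column of a.
    /* c(i, j) = a(i, j) + b(i) * beta, under the shape assumptions stated above */
    _SumByColumnTV(&a, &b, &c, 1.0F);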
...@@ -37,11 +37,8 @@ where c and a are vectors, and b_col is a column in b. ...@@ -37,11 +37,8 @@ where c and a are vectors, and b_col is a column in b.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
...@@ -49,7 +46,7 @@ void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) ...@@ -49,7 +46,7 @@ void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaSumByColumnVT(a, b, c, beta); _CudaSumByColumnVT(a, b, c, beta);
#endif #endif
} }
else { else {
......
...@@ -80,11 +80,8 @@ where c and a are vectors, and b_col is a column in b. ...@@ -80,11 +80,8 @@ where c and a are vectors, and b_col is a column in b.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* summation of a vector (column vector) and a tensor */ /* summation of a vector (column vector) and a tensor */
extern "C" extern "C"
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */ /* sum of a (column) vector and a tensor */
extern "C" extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -36,8 +36,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta ...@@ -36,8 +36,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta
>> beta - scalar >> beta - scalar
>> c - output matrix (2d tensor) >> c - output matrix (2d tensor)
*/ */
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta) XTensor * c, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
......
...@@ -31,9 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,9 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void CudaBLASMatrixMUL(cublasHandle_t * handle, void _CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC, void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta) DTYPE alpha, DTYPE beta)
...@@ -88,7 +88,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -88,7 +88,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC, void ** c, TENSOR_DATA_TYPE dataTypeC,
...@@ -144,7 +144,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -144,7 +144,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC, void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
...@@ -201,9 +201,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -201,9 +201,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void CudaBLASMatrixMULList(cublasHandle_t * handle, void _CudaBLASMatrixMULList(cublasHandle_t * handle,
XList * a, MATRIX_TRANS_TYPE transposedA, const XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, XList * c,
int count, DTYPE alpha, DTYPE beta) int count, DTYPE alpha, DTYPE beta)
{ {
...@@ -255,7 +255,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -255,7 +255,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
if (isUniform) { if (isUniform) {
XMem * mem = a0->mem; XMem * mem = a0->mem;
if (isStrided && a->count > 1) { if (isStrided && a->count > 1) {
CudaBLASMatrixMULBatchedStrided(handle, _CudaBLASMatrixMULBatchedStrided(handle,
a0->data, transposedA, a0->dataType, strideA / a0->unitSize, a0->data, transposedA, a0->dataType, strideA / a0->unitSize,
b0->data, transposedB, b0->dataType, strideB / b0->unitSize, b0->data, transposedB, b0->dataType, strideB / b0->unitSize,
c0->data, c0->dataType, strideC / c0->unitSize, a->count, c0->data, c0->dataType, strideC / c0->unitSize, a->count,
...@@ -297,7 +297,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -297,7 +297,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
cudaMemcpy(bpGPU, bp, sizeof(DTYPE*) * b->count, cudaMemcpyHostToDevice); cudaMemcpy(bpGPU, bp, sizeof(DTYPE*) * b->count, cudaMemcpyHostToDevice);
cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice); cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice);
CudaBLASMatrixMULBatched(handle, _CudaBLASMatrixMULBatched(handle,
(const void**)apGPU, transposedA, a0->dataType, (const void**)apGPU, transposedA, a0->dataType,
(const void**)bpGPU, transposedB, b0->dataType, (const void**)bpGPU, transposedB, b0->dataType,
(void**)cpGPU, c0->dataType, a->count, (void**)cpGPU, c0->dataType, a->count,
...@@ -324,7 +324,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -324,7 +324,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
XTensor * bi = (XTensor*)b->GetItem(i); XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i); XTensor * ci = (XTensor*)c->GetItem(i);
CudaBLASMatrixMUL(handle, _CudaBLASMatrixMUL(handle,
ai->data, transposedA, ai->dataType, ai->data, transposedA, ai->dataType,
bi->data, transposedB, bi->dataType, bi->data, transposedB, bi->dataType,
ci->data, ci->dataType, ci->data, ci->dataType,
......
...@@ -28,21 +28,21 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,21 +28,21 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */ /* matrix multiplication (BLAS) */
extern "C" extern "C"
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA #ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */ /* matrix multiplication via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMUL(cublasHandle_t * handle, void _CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC, void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC, void ** c, TENSOR_DATA_TYPE dataTypeC,
...@@ -50,7 +50,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -50,7 +50,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC, void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
...@@ -58,7 +58,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -58,7 +58,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif #endif
......
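A minimal sketch of the CPU entry point (illustration only; X_NOTRANS is assumed to be the library's "no transpose" value of MATRIX_TRANS_TYPE, and a, b, c are assumed to be 2-d tensors with compatible shapes):
    /* c = a * b * 1.0 + c * 0.0, i.e. a plain matrix product with no transposition */
    _MatrixMULCPU(&a, X_NOTRANS, &b, X_NOTRANS, &c, 1.0F, 0.0F);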
...@@ -30,15 +30,15 @@ convert data type ...@@ -30,15 +30,15 @@ convert data type
>> input - input tensor >> input - input tensor
>> output - output tensor >> output - output tensor
*/ */
void ConvertTensorDataType(XTensor * input, XTensor * output) void _ConvertDataType(const XTensor * input, XTensor * output)
{ {
CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!"); CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType) if (input->dataType == output->dataType)
return; return;
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (input->devID >= 0) { if (input->devID >= 0) {
CudaConvertDataType(input, output); _CudaConvertDataType(input, output);
return; return;
} }
#endif #endif
......
...@@ -78,7 +78,7 @@ data conversion (cuda code) ...@@ -78,7 +78,7 @@ data conversion (cuda code)
>> typeT - target data type >> typeT - target data type
>> size - number of the items in s (and t) >> size - number of the items in s (and t)
*/ */
void CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size) void _CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{ {
CheckNTErrors((devID >= 0), "This code must be run on GPUs!"); CheckNTErrors((devID >= 0), "This code must be run on GPUs!");
...@@ -112,9 +112,9 @@ convert data type (cuda code) ...@@ -112,9 +112,9 @@ convert data type (cuda code)
>> input - input tensor >> input - input tensor
>> output - output tensor >> output - output tensor
*/ */
void CudaConvertDataType(XTensor * input, XTensor * output) void _CudaConvertDataType(const XTensor * input, XTensor * output)
{ {
CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!"); CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType) if (input->dataType == output->dataType)
return; return;
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/ */
#ifndef __CONVERTDATATYPE_CUH__
#define __CONVERTDATATYPE_CUH__
#include "ConvertDataType.h" #include "ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,8 +45,10 @@ __global__ ...@@ -42,8 +45,10 @@ __global__
void KernelIntToFloat(int * inputData, float * outputData, int size); void KernelIntToFloat(int * inputData, float * outputData, int size);
/* convert data type */ /* convert data type */
void CudaConvertDataType(XTensor * input, XTensor * output); void _CudaConvertDataType(const XTensor * input, XTensor * output);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_CUH__
\ No newline at end of file
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */ /* convert data type */
void ConvertDataType(XTensor * input, XTensor * output); void _ConvertDataType(const XTensor * input, XTensor * output);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
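A small sketch of the renamed conversion routine (illustration only; aFloat and aInt are hypothetical pre-allocated tensors of the same shape whose element types both take 4 bytes, which is what the relaxed unitSize check requires):
    /* copy aFloat into aInt, converting each entry from float to int */
    _ConvertDataType(&aFloat, &aInt);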
...@@ -26,8 +26,10 @@ ...@@ -26,8 +26,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
generate a tensor with seleccted data in range[low,high] along the given dimension generate a tensor with selected data in range[low,high] along the given dimension
c = select(a) c = select(a)
>> a - input tensor >> a - input tensor
>> c - result tensor >> c - result tensor
>> dim - the dimension along with which we do the job >> dim - the dimension along with which we do the job
...@@ -35,7 +37,7 @@ c = select(a) ...@@ -35,7 +37,7 @@ c = select(a)
>> high - higher bound. >> high - higher bound.
Note that range [1,3] means that we select 1 and 2. Note that range [1,3] means that we select 1 and 2.
*/ */
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high) void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
{ {
CheckNTErrors(a != NULL && c != NULL, "empty tensors!"); CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must be in the same order!"); CheckNTErrors(a->order == c->order, "The input and output tensors must be in the same order!");
...@@ -76,4 +78,55 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high) ...@@ -76,4 +78,55 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
} }
} }
/*
generate a tensor with selected data in range[low,high] along the given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
c = select(a)
>> a - input tensor
>> dim - the dimension along with which we do the job
>> low - lower bound
>> high - higher bound.
Note that range [1,3] means that we select 1 and 2.
<< return - the result of the generated tensor with selected data
*/
XTensor SelectRange(const XTensor &a, int dim, int low, int high)
{
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!");
for(int i = 0; i < a.order; i++){
if(i == dim){
CheckNTErrors(low >= 0 && low < a.dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a.dimSize[dim], "Illegal range specified!");
dimSize[i] = high - low;
}
else
dimSize[i] = a.dimSize[i];
}
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _SelectRange function */
_SelectRange(&a, &c, dim, low, high);
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
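A usage sketch of the new XTensor-returning overload (illustration only; a is assumed to be an initialized tensor whose dimension 0 has at least three entries):
    /* take indices 1 and 2 (the half-open range [1, 3)) along dimension 0;
       the result is a temporary tensor carrying the GETANDSET_SELECT link */
    XTensor slice = SelectRange(a, 0, 1, 3);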
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
*/
#ifndef __SELECT_CUH__
#define __SELECT_CUH__
#include "Select.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
extern "C"
void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C"
void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor)
#endif // __SELECT_CUH__
\ No newline at end of file
...@@ -26,14 +26,29 @@ ...@@ -26,14 +26,29 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with seleccted data c = select(a) */ /* generate a tensor with selected data c = select(a) */
extern "C" extern "C"
void Select(XTensor * a, XTensor * c, XTensor * indexCPU); void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
/* generate a tensor with seleccted data in range[low,high] along the given dimension /*
c = select(a) */ generate a tensor with selected data c = select(a) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Select(const XTensor &a, XTensor &indexCPU);
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C" extern "C"
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high); void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
/*
generate a tensor with selected data in range[low,high] along the given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
c = select(a)
*/
XTensor SelectRange(const XTensor &a, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -77,7 +77,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high) ...@@ -77,7 +77,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
else{ else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1); XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
SetDataRand(t2, low, high); SetDataRand(t2, low, high);
CopyValues(t2, tensor); _CopyValues(t2, tensor);
delete t2; delete t2;
} }
} }
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "Log.h" #include "Log.h"
#include "Log.cuh" #include "Log.cuh"
#include <math.h>
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its log value set every entry to its log value
>> a - the tensor we are processing >> a - the tensor we are processing
*/ */
void Log(XTensor * a) void _Log(XTensor * a)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaLog(a); _CudaLog(a);
return; return;
} }
#endif #endif
......
...@@ -58,7 +58,7 @@ set each entry to its log value ...@@ -58,7 +58,7 @@ set each entry to its log value
>> a - the tensor >> a - the tensor
*/ */
extern "C" extern "C"
void CudaLog(XTensor * a) void _CudaLog(XTensor * a)
{ {
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/ */
#ifndef __LOG_CUH__
#define __LOG_CUH__
#include "Log.h" #include "Log.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -35,8 +38,10 @@ void KernelLog(__half * d, int size); ...@@ -35,8 +38,10 @@ void KernelLog(__half * d, int size);
/* set each entry to its log value */ /* set each entry to its log value */
extern "C" extern "C"
void CudaLog(XTensor * a); void _CudaLog(XTensor * a);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __LOG_CUH__
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */ /* set every entry to its log value */
extern "C" extern "C"
void Log(XTensor * a); void _Log(XTensor * a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,15 +21,18 @@ ...@@ -21,15 +21,18 @@
#include <math.h> #include <math.h>
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Normalize.h" #include "Normalize.h"
#include "Normalize.cuh" #include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
normalize the data with normal distribution. For an input x, normalize the data with normal distribution
y = a * (x-mean)/sqrt(variance+\epsilon) + b
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - dimension along which we generate the mean and variance >> dim - dimension along which we generate the mean and variance
...@@ -39,7 +42,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -39,7 +42,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias >> b - the bias
>> epsilon - a parameter >> epsilon - a parameter
*/ */
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon) void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{ {
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!"); CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!");
...@@ -68,7 +71,7 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens ...@@ -68,7 +71,7 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens
if (input->devID >= 0 || output->devID >= 0) { if (input->devID >= 0 || output->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaNormalize(input, output, dim, mean, var, a, b, epsilon); _CudaNormalize(input, output, dim, mean, var, a, b, epsilon);
#else #else
ShowNTErrors("Please specify USE_CUDA and recompile the code!"); ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
...@@ -91,4 +94,61 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens ...@@ -91,4 +94,61 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens
} }
} }
} }
/*
normalize the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{
_Normalize(input, input, dim, mean, var, a, b, epsilon);
}
/*
normalize the data with normal distribution (return a XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
<< return - the result of normalizing the data with normal distribution
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon)
{
XTensor output(&input);
output.SetTMP();
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
/* tensor connections */
XList list(5);
list.Add(&input);
list.Add(&mean);
list.Add(&var);
list.Add(&a);
list.Add(&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
return output;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -89,9 +89,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -89,9 +89,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> epsilon - a parameter >> epsilon - a parameter
*/ */
extern "C" extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim, void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var, const XTensor * mean, const XTensor * var,
XTensor * a, XTensor * b, const XTensor * a, const XTensor * b,
DTYPE epsilon) DTYPE epsilon)
{ {
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
......
...@@ -44,9 +44,9 @@ y = a * (x-mean)/sqrt(variance+\epsilon) + b ...@@ -44,9 +44,9 @@ y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/ */
extern "C" extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim, void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var, const XTensor * mean, const XTensor * var,
XTensor * a, XTensor * b, DTYPE epsilon); const XTensor * a, const XTensor * b, DTYPE epsilon);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,12 +27,29 @@ ...@@ -27,12 +27,29 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
normalize the data with normal distribution. For an input x, normalize the data with normal distribution.
y = a * (x-mean)/sqrt(variance+\epsilon) + b For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
extern "C" extern "C"
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon); void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
extern "C"
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (return a XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
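A usage sketch of the XTensor-returning overload (illustration only; x, mean, var, g and bias are hypothetical tensors with the shapes the checks in _Normalize accept):
    /* y = g * (x - mean) / sqrt(var + 1e-6) + bias, normalized along dimension 0 */
    XTensor y = Normalize(x, 0, mean, var, g, bias, (DTYPE)1e-6);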
...@@ -31,12 +31,12 @@ get the power(a, p) ...@@ -31,12 +31,12 @@ get the power(a, p)
>> a - the tensor >> a - the tensor
>> p - the exponent >> p - the exponent
*/ */
void Power(XTensor * a, DTYPE p) void _Power(XTensor * a, DTYPE p)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaPower(a, p); _CudaPower(a, p);
return; return;
} }
#endif #endif
......
...@@ -96,7 +96,7 @@ void KernelPower(__half * d, __half p, int size) ...@@ -96,7 +96,7 @@ void KernelPower(__half * d, __half p, int size)
/* get the power of the entries */ /* get the power of the entries */
extern "C" extern "C"
void CudaPower(XTensor * a, DTYPE p) void _CudaPower(XTensor * a, DTYPE p)
{ {
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
......
...@@ -38,7 +38,7 @@ void KernelSqrtV2(__half * d, int size); ...@@ -38,7 +38,7 @@ void KernelSqrtV2(__half * d, int size);
/* get the power of the entries */ /* get the power of the entries */
extern "C" extern "C"
void CudaPower(XTensor * a, DTYPE p); void _CudaPower(XTensor * a, DTYPE p);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */ /* get the power(x, y) */
extern "C" extern "C"
void Power(XTensor * a, DTYPE p); void _Power(XTensor * a, DTYPE p);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
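A one-line sketch of _Power (illustration only; a is assumed to be a dense tensor):
    /* raise every entry to the power of 2, i.e. a(i) = a(i) * a(i), in place */
    _Power(&a, 2.0F);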
...@@ -28,8 +28,10 @@ ...@@ -28,8 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
scale and shift all tensor entries b = a * scale + shift scale and shift all tensor entries
b = a * scale + shift b = a * scale + shift
>> a - the input tensor >> a - the input tensor
>> b - the output tensor >> b - the output tensor
>> scale - the scaling factor >> scale - the scaling factor
...@@ -76,8 +78,11 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -76,8 +78,11 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
} }
/* /*
scale and shift all tensor entries on site b = a * scale + shift scale and shift all tensor entries (do it on site)
b = a * scale + shift keep the result in the input tensor a and return nothing
a = a * scale + shift
>> a - the input/output tensor >> a - the input/output tensor
>> scale - the scaling factor >> scale - the scaling factor
>> shift - the shift factor >> shift - the shift factor
...@@ -88,19 +93,22 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift) ...@@ -88,19 +93,22 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
} }
/* /*
scale and shift all tensor entries b = a * scale + shift scale and shift all tensor entries (return a XTensor structure)
make a new tensor to keep the result and return it
b = a * scale + shift b = a * scale + shift
>> a - the input tensor >> a - the input tensor
>> b - the output tensor
>> scale - the scaling factor >> scale - the scaling factor
>> shift - the shift factor >> shift - the shift factor
<< return - the result of scaling and shifting all tensor entries
*/ */
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift) XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
{ {
XTensor b(&a); XTensor b(&a);
b.SetTMP(); b.SetTMP();
/* computation */ /* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift); _ScaleAndShift(&a, &b, scale, shift);
/* tensor connections */ /* tensor connections */
......
...@@ -30,13 +30,24 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,13 +30,24 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#define _LinearMe _ScaleAndShiftMe #define _LinearMe _ScaleAndShiftMe
#define Linear ScaleAndShift #define Linear ScaleAndShift
/* scale and shift all tensor entries b = a * scale + shift */ /*
scale and shift all tensor entries
b = a * scale + shift
*/
void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0); void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries on site a = a * scale + shift */ /*
scale and shift all tensor entries
keep the result in the input tensor a and return nothing
a = a * scale + shift
*/
void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0); void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries b = a * scale + shift, and return the result tensor b */ /*
scale and shift all tensor entries
make a new tensor to keep the result and return it
b = a * scale + shift
*/
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0); XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
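A sketch contrasting the three variants declared above (illustration only; a and b are assumed to be initialized tensors of the same shape):
    _ScaleAndShift(&a, &b, 2.0F, 1.0F);          /* b = a * 2 + 1 */
    _ScaleAndShiftMe(&a, 2.0F, 1.0F);            /* a = a * 2 + 1, in place */
    XTensor c = ScaleAndShift(a, 2.0F, 1.0F);    /* new temporary tensor c = a * 2 + 1 */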
...@@ -36,7 +36,7 @@ copy a number of blocks to target positions ...@@ -36,7 +36,7 @@ copy a number of blocks to target positions
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{ {
if (myMem != NULL && myMem->devID >= 0) { if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -44,7 +44,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * ...@@ -44,7 +44,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int *
int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)); int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem); _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
#else #else
...@@ -52,7 +52,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * ...@@ -52,7 +52,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int *
#endif #endif
} }
else { else {
CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem); _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
} }
} }
...@@ -66,14 +66,14 @@ copy a number of blocks from source positions to target positions ...@@ -66,14 +66,14 @@ copy a number of blocks from source positions to target positions
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
if (myMem != NULL) if (myMem != NULL)
CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!"); CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
if (devID >= 0) { if (devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID); _CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
......
...@@ -27,10 +27,10 @@ ...@@ -27,10 +27,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions */ /* copy a number of blocks to target positions */
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
/* copy a number of blocks from source positions to target positions */ /* copy a number of blocks from source positions to target positions */
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -38,7 +38,7 @@ Note that a grid may have a number of blocks ...@@ -38,7 +38,7 @@ Note that a grid may have a number of blocks
>> myMem - the memory pool >> myMem - the memory pool
>> isIndexOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
int * index, int unitSize, bool isIndexOnDev, XMem * myMem) int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{ {
CheckNTErrors((unitSize == sizeof(int)), "TODO!"); CheckNTErrors((unitSize == sizeof(int)), "TODO!");
...@@ -51,7 +51,7 @@ void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, v ...@@ -51,7 +51,7 @@ void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, v
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int)); XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
} }
CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem); _CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev) if (!isIndexOnDev)
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
......
...@@ -216,7 +216,7 @@ Note that a grid may have a number of blocks ...@@ -216,7 +216,7 @@ Note that a grid may have a number of blocks
>> itemSize - size of each data item >> itemSize - size of each data item
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int itemSize, XMem * myMem) void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int itemSize, XMem * myMem)
{ {
CheckNTErrors((myMem != NULL && myMem->devID >= 0), "This code must be run on GPUs!"); CheckNTErrors((myMem != NULL && myMem->devID >= 0), "This code must be run on GPUs!");
CheckNTErrors((itemSize == sizeof(int)), "TODO!"); CheckNTErrors((itemSize == sizeof(int)), "TODO!");
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data by index */ /* copy data by index */
extern "C" extern "C"
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem); void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */ /* copy a number of blocks in grid */
extern "C" extern "C"
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem); void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -36,11 +36,11 @@ all the data has been on the device (CPU/GPU) already. ...@@ -36,11 +36,11 @@ all the data has been on the device (CPU/GPU) already.
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{ {
if (myMem != NULL && myMem->devID >= 0) { if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem); _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
......
...@@ -80,7 +80,7 @@ copy a number of blocks to target positions (cuda version) ...@@ -80,7 +80,7 @@ copy a number of blocks to target positions (cuda version)
>> targetBlocks - target positions of the copy (on the device) >> targetBlocks - target positions of the copy (on the device)
>> myMem - memory pool >> myMem - memory pool
*/ */
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{ {
CheckNTErrors((myMem != NULL), "No memory pool!"); CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!"); CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
......
...@@ -34,7 +34,7 @@ void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * targe ...@@ -34,7 +34,7 @@ void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * targe
/* copy a number of blocks to target positions (cuda version) */ /* copy a number of blocks to target positions (cuda version) */
extern "C" extern "C"
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */ /* copy a number of blocks to target positions (on site) */
extern "C" extern "C"
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -70,7 +70,7 @@ copy a number of blocks from source positions to target positions (cuda version) ...@@ -70,7 +70,7 @@ copy a number of blocks from source positions to target positions (cuda version)
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - memory pool >> myMem - memory pool
*/ */
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
CheckNTErrors((devID >= 0), "Wrong device to run!"); CheckNTErrors((devID >= 0), "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!"); CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
......
...@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, ...@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks,
/* copy a number of blocks from source positions to target positions (cuda version) */ /* copy a number of blocks from source positions to target positions (cuda version) */
extern "C" extern "C"
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -36,7 +36,7 @@ copy data blocks by 2d layout ...@@ -36,7 +36,7 @@ copy data blocks by 2d layout
>> n - height of each block >> n - height of each block
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int blockNum, int mSize, int n, XMem * myMem) void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int blockNum, int mSize, int n, XMem * myMem)
{ {
int devID = myMem != NULL ? myMem->devID : -1; int devID = myMem != NULL ? myMem->devID : -1;
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data blocks by 2d layout */ /* copy data blocks by 2d layout */
extern "C" extern "C"
void CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem); void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -36,7 +36,7 @@ in the k-th grid ...@@ -36,7 +36,7 @@ in the k-th grid
>> blockNumInGrid - number of blocks in each grid >> blockNumInGrid - number of blocks in each grid
>> isIndexOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev) void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{ {
CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!"); CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!");
...@@ -50,7 +50,7 @@ void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNu ...@@ -50,7 +50,7 @@ void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNu
CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!"); CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
gridNum = s->unitNum / (blockSize * blockNum); gridNum = s->unitNum / (blockSize * blockNum);
CopyBlocksInGrid(s->data, blockSize, blockNum, gridNum, t->data, index, s->unitSize, isIndexOnDev, s->mem); _CopyBlocksInGrid(s->data, blockSize, blockNum, gridNum, t->data, index, s->unitSize, isIndexOnDev, s->mem);
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in a grid, i.e., reorder the data blocks in the same memory piece */ /* copy a number of blocks in a grid, i.e., reorder the data blocks in the same memory piece */
extern "C" extern "C"
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false); void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,11 +21,13 @@ ...@@ -21,11 +21,13 @@
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
copy indexed sub-tensors copy indexed sub-tensors
>> s - the source tensor >> s - the source tensor
>> t - the target tensor >> t - the target tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
...@@ -34,11 +36,11 @@ copy indexed sub-tensors ...@@ -34,11 +36,11 @@ copy indexed sub-tensors
>> srcIndex - index of the source sub-tensors >> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex) >> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors >> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g., >> copyNum - number of the sub-tensors we copy for each source index,
for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5 e.g., for srcIndex = [1,4] and copyNum = 2,
<< return - whether copy indexed operation was successful we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -84,12 +86,62 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz ...@@ -84,12 +86,62 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz
CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!"); CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
} }
CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID); _CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);
delete[] realSrcIndex; delete[] realSrcIndex;
delete[] realTgtIndex; delete[] realTgtIndex;
}
/*
copy indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
CheckNTErrors(&s, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
t.SetTMP();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadPointer(&t, srcIndex);
XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
return true; return t;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
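A minimal usage sketch of the XTensor-returning CopyIndexed wrapper, assuming the NewTensor/X_FLOAT construction pattern used in the wrapper above; the 6 x 8 shape and the index values are made up for illustration.

#include "CopyIndexed.h"

using namespace nts;

void CopyIndexedExample()
{
    /* a 6 x 8 source tensor; along dim 0 it splits into 6 sub-tensors of size (8) */
    int dims[2] = {6, 8};
    XTensor s = NewTensor(2, dims, X_FLOAT, 1.0F, -1, NULL);
    s.SetZeroAll();

    int srcIndex[2] = {1, 4};   /* start copying at source sub-tensors 1 and 4 */
    int tgtIndex[2] = {0, 2};   /* write them to target positions 0 and 2 */

    /* with copyNum = 2 the source sub-tensors 1, 2, 4, 5 are copied, so t has
       indexSize * copyNum = 4 sub-tensors along dim 0, i.e., shape (4, 8) */
    XTensor t = CopyIndexed(s, 0, srcIndex, 2, tgtIndex, 2);
}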
...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
extern "C" extern "C"
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/*
copy selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,18 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,18 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
copy s to t copy s to t
>> s - source >> s - source
>> t - target >> t - target
>> stream - the stream for creating the job pipeline >> stream - the stream for creating the job pipeline
<< return - succeeded or not
*/ */
bool CopyValues(const XTensor * s, XTensor * t, XStream * stream) void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
{ {
if (s == NULL || t == NULL) CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
return false; CheckNTErrors((s->data != NULL), "Cannot copy from an empty data array!");
if (s->data == NULL || t->data == NULL)
return false;
CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!"); CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!");
CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!"); CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!");
...@@ -48,12 +45,13 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -48,12 +45,13 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
"The code must be run on the same device!"); "The code must be run on the same device!");
CheckNTErrors((s->isSparse || t->isSparse), "TODO!"); CheckNTErrors((s->isSparse || t->isSparse), "TODO!");
ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum); ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum);
return true;
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) if (s->devID >= 0 || t->devID >= 0) {
return CudaCopyValues(s, t, stream); _CudaCopyValues(s, t, stream);
return;
}
#endif #endif
if (!s->isSparse && !t->isSparse) { if (!s->isSparse && !t->isSparse) {
...@@ -68,8 +66,28 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -68,8 +66,28 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
}
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copied tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
{
XTensor t(&s);
t.SetTMP();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYVALUES);
return true; return t;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
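A minimal usage sketch of the XTensor-returning CopyValues, again assuming the NewTensor/X_FLOAT construction pattern used above; the 2 x 3 shape is made up.

#include "CopyValues.h"

using namespace nts;

void CopyValuesExample()
{
    int dims[2] = {2, 3};
    XTensor a = NewTensor(2, dims, X_FLOAT, 1.0F, -1, NULL);
    a.SetZeroAll();

    /* b is a temporary copy of a (SetTMP), linked to a via MOVEMENT_COPYVALUES */
    XTensor b = CopyValues(a);
}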
...@@ -35,11 +35,9 @@ copy a range of elements from a source vector to a target vector ...@@ -35,11 +35,9 @@ copy a range of elements from a source vector to a target vector
>> stream - the stream for creating the job pipeline >> stream - the stream for creating the job pipeline
<< return - succeed or not << return - succeed or not
*/ */
bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
{ {
if (s == NULL || t == NULL) CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
return false;
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!"); CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
CheckNTErrors((s->unitSize == t->unitSize), "Incompatible vectors in value copy."); CheckNTErrors((s->unitSize == t->unitSize), "Incompatible vectors in value copy.");
CheckNTErrors((s->denseRatio <= t->denseRatio), "Incompatible vectors in value copy."); CheckNTErrors((s->denseRatio <= t->denseRatio), "Incompatible vectors in value copy.");
...@@ -83,8 +81,6 @@ bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -83,8 +81,6 @@ bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
return true;
} }
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C" extern "C"
bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */ /* copy s to t */
extern "C" extern "C"
bool CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,12 +27,13 @@ ...@@ -27,12 +27,13 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
get the max value of the items along a dimension of the tensor. get the max value of the items along a dimension of the tensor
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension along which the reduction is performed >> dim - the dimension along which the reduction is performed
*/ */
void ReduceMax(XTensor * input, XTensor * output, int dim) void _ReduceMax(const XTensor * input, XTensor * output, int dim)
{ {
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!"); "This code must be run on the same device!");
...@@ -55,7 +56,7 @@ void ReduceMax(XTensor * input, XTensor * output, int dim) ...@@ -55,7 +56,7 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
if(input->devID >= 0){ if(input->devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
CudaReduceMax(input, output, dim); _CudaReduceMax(input, output, dim);
#endif #endif
} }
else{ else{
...@@ -90,4 +91,43 @@ void ReduceMax(XTensor * input, XTensor * output, int dim) ...@@ -90,4 +91,43 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
} }
} }
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - the input tensor
>> dim - the dimension along which the reduction is performed
<< return - the max value of the items along a dimension of the tensor
*/
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
return output;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
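A minimal usage sketch of the XTensor-returning ReduceMax, assuming the NewTensor/X_FLOAT construction pattern used above; the 2 x 4 shape is made up. Reducing along dim = 1 removes that dimension, so the result is an order-1 tensor with 2 items, each holding the maximum of one row.

#include "ReduceMax.h"

using namespace nts;

void ReduceMaxExample()
{
    int dims[2] = {2, 4};
    XTensor input = NewTensor(2, dims, X_FLOAT, 1.0F, -1, NULL);
    input.SetZeroAll();

    /* output.order == 1 and output.dimSize[0] == 2 */
    XTensor output = ReduceMax(input, 1);
}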
...@@ -334,7 +334,7 @@ sum_i = max_{0<=j<strideNum} input_{i,j} ...@@ -334,7 +334,7 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
>> output - the output tensor >> output - the output tensor
>> dim - which dimension to reduce >> dim - which dimension to reduce
*/ */
void CudaReduceMax(XTensor * input, XTensor * output, int dim) void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
{ {
CheckNTErrors((input && output), "Empty input or output tensors!"); CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!"); CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
......
...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */ /* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C" extern "C"
void CudaReduceMax(XTensor * input, XTensor * output, int dim); void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
extern "C" extern "C"
void ReduceMax(XTensor * input, XTensor * output, int dim); void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......