Commit f5149a15 by liyinqiao

Merge with Yuhao branch (with a little bit of change).

parent f0b49d6d
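
The recurring change across the hunks below is the removal of Reversed Dimension Indexing (RDI): the dimSizeRDI array is dropped from XTensor and every dimSizeRDI lookup becomes a direct dimSize lookup. A minimal sketch of the index mapping (illustration only, not part of the commit):

    /* RDI stored the dimensions of an order-n tensor in reversed order,
       so the two views relate as:
           dimSizeRDI[i] == dimSize[n - 1 - i]
       Hence the old leadingDimRDI = n - leadingDim - 1 satisfies
           dimSizeRDI[leadingDimRDI] == dimSize[leadingDim],
       and old stride loops over "i < leadingDimRDI" (low RDI indices =
       trailing dimensions) become "i > leadingDim" loops below. */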
@@ -30,8 +30,9 @@
 #include "XDevice.h"
 #include "./test/Test.h"
 #include "./core/CHeader.h"
-#include "./loss/CrossEntropy.h"
+#include "./XBLAS.h"
+#include "./core/sort/TopK.h"
+#include "./core/movement/Gather.h"
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
 //#include <crtdbg.h>
......
@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
 bool CONST_TRUE = true;

 int verboseLevel = 0;
-bool useBLAS = false;
-
-#ifdef USE_CUDA
-bool useCUDA = true;
-#else
-bool useCUDA = false;
-#endif

 FILE * tmpLog = NULL;
 double myTime = 0;
......
@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
 #define NIUTRANSNNDEBUG

 extern int verboseLevel;
-extern bool useBLAS;
-extern bool useCUDA;

 #define FFLUSH(FILEH) \
 { \
......
@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
     if (freeMem >= MILLION * 512){
         *myBufSize = MILLION * 128;
         if (freeMem >= MILLION * 1024) {
-            *myBufSize = MILLION * 256;
+            *myBufSize = MILLION * 128;
             if (freeMem >= MILLION * 2048)
-                *myBufSize = MILLION * 512;
+                *myBufSize = MILLION * 128;
         }
     }
 }
......
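
The effect of the GetBufferSize change is that the buffer no longer grows with free memory; a worked example of the new tiers (illustration only):

    /* freeMem <  512 * MILLION : this branch leaves *myBufSize untouched
       freeMem >=  512 * MILLION : *myBufSize = 128 * MILLION
       freeMem >= 1024 * MILLION : *myBufSize = 128 * MILLION (was 256)
       freeMem >= 2048 * MILLION : *myBufSize = 128 * MILLION (was 512) */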
...@@ -266,7 +266,6 @@ void XTensor::Init() ...@@ -266,7 +266,6 @@ void XTensor::Init()
devID = -1; devID = -1;
order = -1; order = -1;
memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM); memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = DEFAULT_DTYPE; dataType = DEFAULT_DTYPE;
unitSize = sizeof(float); unitSize = sizeof(float);
unitNum = 0; unitNum = 0;
...@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor) ...@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
order = tensor.order; order = tensor.order;
enableGrad = tensor.enableGrad; enableGrad = tensor.enableGrad;
memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM); memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = tensor.dataType; dataType = tensor.dataType;
unitSize = tensor.unitSize; unitSize = tensor.unitSize;
unitNum = tensor.unitNum; unitNum = tensor.unitNum;
...@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem) ...@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim) bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{ {
if (a == NULL || b == NULL) if(a == NULL || b == NULL)
return false; return false;
if ((a->order - 1) != b->order) if ((a->order - 1) != b->order)
...@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize) ...@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize)
{ {
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i]; dimSize[i] = myDimSize[i];
dimSizeRDI[order - i - 1] = myDimSize[i];
} }
} }
...@@ -598,20 +595,17 @@ reshape the tensor ...@@ -598,20 +595,17 @@ reshape the tensor
void XTensor::Reshape(const int myOrder, const int * myDimSize) void XTensor::Reshape(const int myOrder, const int * myDimSize)
{ {
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
int dimsRDI[MAX_TENSOR_DIM_NUM];
int num = 1; int num = 1;
for(int i = 0; i < myOrder; i++){ for(int i = 0; i < myOrder; i++){
num *= myDimSize[i]; num *= myDimSize[i];
dims[i] = abs(myDimSize[i]); dims[i] = abs(myDimSize[i]);
dimsRDI[myOrder - i - 1] = dims[i];
} }
CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!"); CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
order = myOrder; order = myOrder;
memcpy(dimSize, dims, sizeof(int) * order); memcpy(dimSize, dims, sizeof(int) * order);
memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
} }
/* /*
...@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const ...@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const
{ {
CheckNTErrors((size == order), "Illegal index!"); CheckNTErrors((size == order), "Illegal index!");
int * indexRDI = new int[size]; int offset = index[0];
for (int i = 0; i < size; i++) for(int i = 1; i < size; ++i){
indexRDI[size - i - 1] = index[i]; CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
int offset = indexRDI[size - 1];
for(int i = size - 2; i >= 0; i--){
CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
offset = offset * dimSizeRDI[i] + indexRDI[i];
} }
delete[] indexRDI;
if(isSparse){ if(isSparse){
DTYPE value; DTYPE value;
void * p; void * p;
...@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize, ...@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool zeroData = false; bool zeroData = false;
for(int i = 0; i < order; i++){ for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]); dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0) if(myDimSize[i] < 0)
filledData = false; filledData = false;
if(myDimSize[i] == 0) if(myDimSize[i] == 0)
...@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, ...@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if (isSparse) { if (isSparse) {
int num = 0; int num = 0;
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
num *= dimSizeRDI[i]; num *= dimSize[i];
num = int(num * denseRatio + 1); num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE); int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num); int size = sizeof(int) + tupleSize*(num);
...@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label) ...@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label)
int ds[MAX_TENSOR_DIM_NUM]; int ds[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
ds[i] = key % dimSizeRDI[i]; ds[i] = key % dimSize[i];
key /= dimSizeRDI[i]; key /= dimSize[i];
} }
Set(value, ds); Set(value, ds);
} }
......
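
The rewritten GetCell offset is plain row-major addressing over dimSize. A worked example (illustration only, not from the commit):

    /* order = 3, dimSize = {2, 3, 4}, index = {1, 2, 3}:
       offset = index[0];                        // 1
       offset = offset * dimSize[1] + index[1];  // 1 * 3 + 2 = 5
       offset = offset * dimSize[2] + index[2];  // 5 * 4 + 3 = 23
       i.e. offset = (i * d1 + j) * d2 + k for cell (i, j, k). */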
@@ -100,9 +100,6 @@ public:
     /* size of each dimension */
     int dimSize[MAX_TENSOR_DIM_NUM];

-    /* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
-    int dimSizeRDI[MAX_TENSOR_DIM_NUM];
-
     /* data unit - data type for every cell */
     TENSOR_DATA_TYPE dataType;
......
@@ -49,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);

-    int leadingDimRDI = a->order - leadingDim - 1;
-
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaDiv(a, b, c, alpha, leadingDim);
@@ -64,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
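
In these element-wise routines the tensor is processed as blockNum blocks of blockSize = stride * dimensionSize elements; with direct indexing, stride is the product of the dimensions after leadingDim. A worked example (illustration only):

    /* a->dimSize = {4, 3, 5}, leadingDim = 1:
       stride         = 5                         (dims after leadingDim)
       dimensionSizeA = 3                         (the leading dim itself)
       blockSizeA     = stride * dimensionSizeA = 15
       blockNum       = a->unitNum / blockSizeA = 60 / 15 = 4 */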
@@ -122,7 +122,6 @@ where i is the item index
 */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
         return;
     }

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];

     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     int bBlockNum = 1;
     int cBlockNum = 1;

-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
-        aBlockNum *= a->dimSizeRDI[i];
-        cBlockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
+        aBlockNum *= a->dimSize[i];
+        cBlockNum *= a->dimSize[i];
     }

-    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
-        bBlockNum *= b->dimSizeRDI[i];
-        cBlockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++) {
+        CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
+        bBlockNum *= b->dimSize[i];
+        cBlockNum *= b->dimSize[i];
     }

     TensorList * aList = new TensorList(10);
     TensorList * bList = new TensorList(10);
     TensorList * cList = new TensorList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };

     bool isSparseMul = false;
@@ -215,20 +216,20 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
         return false;

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

     int order = a->order + b->order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a->order; i++)
-        dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
-    for (int i = 2; i < b->order; i++)
-        dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
+    for (int i = 0; i < a->order - 2; i++)
+        dimSize[sub++] = a->dimSize[i];
+    for (int i = 0; i < b->order - 2; i++)
+        dimSize[sub++] = b->dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -271,20 +272,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -318,20 +319,20 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {

-        int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-        int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-        int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-        int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+        int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+        int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+        int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+        int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];

         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
@@ -370,20 +371,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -416,20 +417,20 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
     if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {

-        int an = a.dimSizeRDI[1];
-        int am = a.dimSizeRDI[0];
-        int bn = b.dimSizeRDI[1];
-        int bm = b.dimSizeRDI[0];
+        int an = a.dimSize[a.order - 2];
+        int am = a.dimSize[a.order - 1];
+        int bn = b.dimSize[b.order - 2];
+        int bm = b.dimSize[b.order - 1];

         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
......
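
After this change the last two dimensions, dimSize[order - 2] (rows) and dimSize[order - 1] (columns), are the matrix dimensions, and all leading dimensions act as batch dimensions placed first in the output shape. A worked shape example (illustration only):

    /* a: (4, 2, 3), b: (6, 3, 5), no transposition:
       an = 2, am = 3, bn = 3, bm = 5       -> am == bn passes
       order   = a.order + b.order - 2 = 4
       dimSize = {4, 6, 2, 5}    // a's batch dims, b's batch dims, an, bm */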
@@ -95,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
     CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];

     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;

-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }

     int devIDBackup = 0;
@@ -126,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                                 a->data, transposedA, a->dataType, aBlockSize,
                                 b->data, transposedB, b->dataType, bBlockSize,
                                 c->data, c->dataType, cBlockSize, blockNum,
-                                a->dimSizeRDI[1], a->dimSizeRDI[0],
-                                b->dimSizeRDI[1], b->dimSizeRDI[0],
-                                c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
+                                a->dimSize[a->order - 2], a->dimSize[a->order - 1],
+                                b->dimSize[b->order - 2], b->dimSize[b->order - 1],
+                                c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);

     BacktoCudaDev(a->devID, devIDBackup);
 #endif
@@ -164,32 +164,32 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];

     CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");

-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;

-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }

-    int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
-    int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
-    int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
+    int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
+    int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
+    int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};

     XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
     XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
@@ -292,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
@@ -350,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......
@@ -71,20 +71,21 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");

-    int xn = x.dimSizeRDI[1];
-    int xm = x.dimSizeRDI[0];
-    int wn = w.dimSizeRDI[1];
-    int wm = w.dimSizeRDI[0];
+    int xn = x.dimSize[x.order - 2];
+    int xm = x.dimSize[x.order - 1];
+    int wn = w.dimSize[w.order - 2];
+    int wm = w.dimSize[w.order - 1];

     CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");

     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
@@ -148,18 +149,18 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");

-    int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
-    int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
-    int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
-    int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
+    int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
+    int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
+    int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
+    int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];

     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
......
@@ -49,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);

-    int leadingDimRDI = a->order - leadingDim - 1;
-
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaMultiply(a, b, c, alpha, leadingDim);
@@ -64,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
@@ -122,26 +122,25 @@ where i is the item index
 */
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
-    CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
+    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
-    CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");

     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
@@ -70,20 +70,6 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
         return;
     }

-    /*int dims[MAX_TENSOR_DIM_NUM];
-    for(int i = 0; i < a->order; i++)
-        dims[i] = 1;
-    dims[n] = a->GetDim(n);
-
-    XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
-
-    _CopyValues(b, b2);
-    _SumBroadcast(a, b2, c, beta);
-
-    DelTensor(b2);
-
-    return;*/
-
     if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
 #ifdef USE_CUDA
         _CudaSumDim(a, b, c, n, beta);
......
@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
     int col = colIndex % colNum;
     int block = colIndex / colNum;

-    if (row >= rowNum || block >= blockNum)
+    if(row >= rowNum || block >= blockNum)
         return;

-    if (threadIdx.x == 0)
+    if(threadIdx.x == 0)
         bv[threadIdx.y] = b[row];

     __syncthreads();

     int offset = block * blockSize + row * colNum + col;

-    if (betaFired)
+    if(betaFired)
         c[offset] = a[offset] + bv[threadIdx.y] * beta;
     else
         c[offset] = a[offset] + bv[threadIdx.y];
......
@@ -140,6 +140,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
 }

 /*
+convert index tensor to onehot tensor
+
+>> index - index tensor, which value is an integer num
+>> onehot - onehot tensor, which value is 0 or 1
+>> size - the last dimension size of the onehot tensor
+*/
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
+{
+    /*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
+    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
+
+    onehot->SetZeroAll();
+
+#ifdef USE_CUDA
+    if (onehot->devID >= 0) {
+        delete[] cudaIndex;
+        return;
+    }
+#endif
+
+    int blockNum = n;
+    int stride = size;
+
+    int * indexData = (int *)index;
+    int * onehotData = (int *)onehot->data;
+
+    for (int i = 0; i < blockNum; i++) {
+        int id = indexData[i];
+        int * od = onehotData + i * stride;
+        od[id] = 1;
+    }*/
+
+    XTensor* cudaIndex = NewTensor1D(n, X_INT, onehot->devID);
+    cudaIndex->SetData(index, n);
+    _IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
+    delete[] cudaIndex;
+}
+
+/*
 convert onehot tensor to index tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
......
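
The new raw-array overload of _IndexToOnehot wraps the int buffer in a one-dimensional X_INT tensor on the onehot tensor's device and forwards to the tensor-based version. A hedged usage sketch (vocabSize, devID and the label values are made up for illustration):

    int labels[4] = {2, 0, 3, 1};                          // n = 4 gold labels
    XTensor onehot;
    InitTensor2D(&onehot, 4, vocabSize, X_FLOAT, devID);
    _IndexToOnehot(labels, 4, &onehot, vocabSize, 0.1F);   // 0.1 = label smoothing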
@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
 /* convert index tensor to onehot tensor */
 void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);

+/* convert index tensor to onehot tensor */
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
+
 /* convert index tensor to onehot tensor (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
......
@@ -26,6 +26,82 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, int* index, int dim)
+{
+    CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
+    CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
+    CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
+    CheckNTErrors(a->dataType == c->dataType, "The tensor must be of the same data type!");
+
+    int stride = 1;
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];
+    printf("\n%d %d\n", a->order - dim - 1, stride);
+
+    int copyTimes = 1;
+    for (int i = 0; i < dim; i++)
+    {
+        copyTimes *= a->dimSize[i];
+    }
+
+    int cot = c->dimSize[dim];
+    int blockSize = stride * a->unitSize;
+    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
+    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
+    char * s = (char*)a->data;
+    char * t = (char*)c->data;
+
+    for (int i = 0; i < copyTimes; i++) {
+        for (int j = 0; j < cot; ++j) {
+            XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
+        }
+        s += stepSizeS;
+        t += stepSizeT;
+    }
+}
+
+/*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
+{
+    if (index->devID >= 0)
+    {
+        int* indexCPU = new int[index->unitNum];
+        XMemCopy(indexCPU, -1, index->data, index->devID, index->unitNum * sizeof(int));
+        _Select(a, c, indexCPU, dim);
+        delete[] indexCPU;
+    }
+    else
+    {
+        _Select(a, c, (int *)index->data, dim);
+    }
+}
+
+/*
+*/
+/*XTensor Select(const XTensor &a, int* index, int dim)
+{
+
+}*/
+
+/*
 generate a tensor with selected data in range[low,high] along the given dimension

 c = select(a)
@@ -58,13 +134,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
     }

     int stride = 1;
-    int dimRDI = a->order - dim - 1;
-    for(int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
+    for(int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int copyTimes = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        copyTimes *= a->dimSizeRDI[i];
+    for (int i = 0; i < dim; i++)
+        copyTimes *= a->dimSize[i];

     int blockSize = stride * (high - low) * a->unitSize;
     int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
@@ -117,12 +192,10 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
     _SelectRange(&a, &c, dim, low, high);

     /* tensor connection */
-    if (a.enableGrad) {
-        XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
-        XLink::AddParamToHeadInt(&c, dim);
-        XLink::AddParamToHeadInt(&c, low);
-        XLink::AddParamToHeadInt(&c, high);
-    }
+    XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
+    XLink::AddParamToHeadInt(&c, dim);
+    XLink::AddParamToHeadInt(&c, low);
+    XLink::AddParamToHeadInt(&c, high);

     /* destroy variables */
     delete[] dimSize;
......
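
A hedged usage sketch of the new _Select (shapes and values made up for illustration; c must be pre-sized so that c->dimSize[dim] equals the number of selected indices):

    int srcDims[2] = {2, 4};
    int tgtDims[2] = {2, 2};
    XTensor a, c;
    InitTensor(&a, 2, srcDims, X_FLOAT);
    InitTensor(&c, 2, tgtDims, X_FLOAT);
    a.SetDataRand();

    int index[2] = {0, 2};
    _Select(&a, &c, index, 1);    // c(i, j) = a(i, index[j])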
@@ -27,7 +27,10 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* generate a tensor with selected data c = select(a) */
-void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
+void _Select(const XTensor * a, XTensor * c, int* index, int dim);
+
+/* generate a tensor with selected data c = select(a) */
+void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);

 /*
 generate a tensor with selected data c = select(a) (returna a XTensor structure)
......
@@ -47,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
                 const XTensor * mean, const XTensor * var,
                 const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
-    int dimRDI = input->order - dim - 1;
     CheckNTErrors((_IsSameShaped(input, output)), "Unmatched input tensors!");
     CheckNTErrors((_IsSameShaped(a, b)), "Unmatched input tensors");
     CheckNTErrors((_IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
-    CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
+    CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");

     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
-            stride *= input->dimSizeRDI[i];
+        if (i < dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
+            blockNum *= input->dimSize[i];
         }
-        else if (i > dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
-            blockNum *= input->dimSizeRDI[i];
+        else if (i > dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
+            stride *= input->dimSize[i];
         }
     }
     blockSize = stride * strideNum;
......
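
With direct indexing the accumulation flips relative to RDI: dimensions before dim now count blocks and dimensions after dim form the stride. A worked example (illustration only):

    /* input->dimSize = {8, 4, 6}, dim = 1:
       blockNum  = 8                       (dims before dim)
       strideNum = 4                       (the normalized dim itself)
       stride    = 6                       (dims after dim)
       blockSize = stride * strideNum = 24 */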
@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

-    int dimRDI = input->order - dim - 1;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i > dim)
+            stride *= input->dimSize[i];
+        else if (i < dim)
+            blockNum *= input->dimSize[i];
     }

     int cudaGridSize[3];
......
@@ -41,12 +41,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
 {
     CheckNTErrors((_IsSameShaped(s, t)), "Unmatched tensors!");

-    int blockDimRDI = s->order - blockDim - 1;
     int blockSize = 1;
     int blockNum = blockNumInGrid;
     int gridNum = 1;
-    for (int i = 0; i < blockDimRDI; i++)
-        blockSize *= s->dimSizeRDI[i];
+    for (int i = blockDim; i < s->order; i++)
+        blockSize *= s->dimSize[i];

     CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
     gridNum = s->unitNum / (blockSize * blockNum);
......
@@ -53,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
     CheckNTErrors(dim < s->order && dim < t->order, "A too larget dimension specified!");
     CheckNTErrors(s->unitSize == t->unitSize, "Unmatched tensors!");

-    int dimRDI = s->order - dim - 1;
     int blockSizeSrc = 1;
     int blockSizeTgt = 1;
     int blockNumSrc = 1;
     int blockNumTgt = 1;
-    int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
-    int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
+    int leadDimSizeSrc = s->dimSize[dim];
+    int leadDimSizeTgt = t->dimSize[dim];
     int indexOffsetNum = 1;

-    for (int i = 0; i < dimRDI; i++) {
-        blockSizeSrc *= s->dimSizeRDI[i];
-        blockSizeTgt *= t->dimSizeRDI[i];
+    for (int i = dim + 1; i < s->order; i++) {
+        blockSizeSrc *= s->dimSize[i];
+    }
+    for (int i = dim + 1; i < t->order; i++) {
+        blockSizeTgt *= t->dimSize[i];
+    }
+    for (int i = 0; i <= dim; i++)
+    {
+        blockNumSrc *= s->dimSize[i];
+        blockNumTgt *= t->dimSize[i];
     }
-    for (int i = dimRDI; i < s->order; i++)
-        blockNumSrc *= s->dimSizeRDI[i];
-    for (int i = dimRDI; i < t->order; i++)
-        blockNumTgt *= t->dimSizeRDI[i];

     CheckNTErrors(blockSizeSrc == blockSizeTgt, "Unmatched tensors!");
-    indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
+    indexOffsetNum = blockNumSrc / s->dimSize[dim];

     int realIndexSize = indexOffsetNum * indexSize * copyNum;
     int * realSrcIndex = new int[realIndexSize];
@@ -219,14 +221,14 @@ make a new tensor to keep the result and return it

 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-             e.g., for srcIndex = [0,1] and copyNum = 2,
-             we actually copy the source sub-tensors 0, 1, 1 and 2
+             e.g., for srcIndex = [1,4] and copyNum = 2,
+             we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor & s, int dim,
@@ -277,14 +279,14 @@ make a new tensor to keep the result and return it

 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-             e.g., for srcIndex = [0,1] and copyNum = 2,
-             we actually copy the source sub-tensors 0, 1, 1 and 2
+             e.g., for srcIndex = [1,4] and copyNum = 2,
+             we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
......
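
A worked example of the copyNum semantics documented above (illustration only): each source index expands into a run of copyNum consecutive sub-tensors.

    /* srcIndex = {1, 4}, copyNum = 2
       -> sub-tensors copied: 1, 2 (run starting at 1) and 4, 5 (run starting at 4);
       tgtIndex gives the start of each run on the target side. */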
...@@ -33,6 +33,51 @@ gather indexed sub-tensors ...@@ -33,6 +33,51 @@ gather indexed sub-tensors
>> s - the source tensor >> s - the source tensor
>> t - the target tensor >> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim);
return;
}
#endif
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
...@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!"); CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int order = index.order + 1; int order = s.order;
int * dimSize = new int[order]; int * dimSize = new int[order];
memcpy(dimSize, index.dimSize, index.order * sizeof(int));
dimSize[index.order] = s.GetDim(-1); for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = index.unitNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio; float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem); XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
...@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather(&s, &t, &index); _Gather(&s, &t, &index);
/* tensor connection */ /* tensor connection */
if (s.enableGrad) {
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER); XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
}
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = t.GetDim(-1);
XTensor tt;
tt = Reshape(t, index.order + 1, dims);
delete[] dims;
return tt;
}
else {
return t; return t;
}
} }
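The reshape above restores the index shape on the output. A worked example with illustrative shapes, assuming the gathered dimension is 0: for s of shape (10, 8) and index of shape (2, 3), index.unitNum = 6, so _Gather first fills a (6, 8) tensor and the final Reshape hands it back as (2, 3, 8):

    /* illustrative: gathering a (10, 8) table with a (2, 3) index */
    XTensor g = Gather(s, index);   /* g has order 3 and dims (2, 3, 8) */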
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int ...@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/* /*
gather indexed sub-tensors (cuda version) gather indexed sub-tensors (cuda version)
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the flat indices of the source elements
>> stride - number of items in the dimensions after the gathered one
>> strideNum - number of items along the gathered dimension
>> blockNum - number of data blocks (product of the dimensions before it)
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
int size = stride * strideNum * blockNum;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]];
}
}
/*
gather indexed sub-tensors (cuda version)
>> s - the source tensor >> s - the source tensor
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
...@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
/*
gather indexed sub-tensors (cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    int devID = s->devID;
    XMem * mem = s->mem;

    int stride = 1;
    int blockNum = 1;
    int indexSize = srcIndex->unitNum;
    int strideNum = srcIndex->dimSize[dim];
    for (int i = 0; i < dim; i++)
        blockNum *= srcIndex->dimSize[i];
    for (int i = dim + 1; i < srcIndex->order; i++)
        stride *= srcIndex->dimSize[i];

    int * sIndex = NULL;
    if (srcIndex->devID < 0) {
        /* the index lives on the host: stage it on the device first */
        sIndex = mem != NULL ?
                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);
        XMemCopy(sIndex, devID, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
    }
    else
        sIndex = (int*)srcIndex->data;

    int cudaGrids[3];
    int cudaBlocks[3];
    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);

    KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                 ((DTYPE*)s->data, (DTYPE*)t->data, sIndex, stride, strideNum, blockNum);

    if (srcIndex->devID < 0) {
        /* release the staged index buffer */
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
        else
            XMemFree(devID, sIndex);
    }
}
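The stride/strideNum/blockNum computation above is the one pattern that replaces all dimSizeRDI arithmetic in this commit: for dimension dim of a tensor with dimensions d[0..n-1], blockNum is the product of the dimensions before dim, strideNum is d[dim] itself, and stride is the product of the dimensions after it. A worked example for an illustrative (2, 3, 4) index tensor with dim = 1:

    int blockNum  = 2;   /* d[0], dimensions before dim  */
    int strideNum = 3;   /* d[1], the dimension itself   */
    int stride    = 4;   /* d[2], dimensions after dim   */
    /* blockNum * strideNum * stride = 24 = unitNum      */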
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors (cuda version) */ /* gather indexed sub-tensors (cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex); void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,8 +27,14 @@ ...@@ -27,8 +27,14 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */ /* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex); void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors according to the dimension */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
/* gather selected sub-tensors (return an XTensor structure) /* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Gather(XTensor &s, XTensor &index); XTensor Gather(XTensor &s, XTensor &index);
......
...@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */ /* get the max-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim); void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
void _ReduceMax(const XTensor * input, XTensor * output, int dim); void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min value of the items along a dimension of the tensor. */
void _ReduceMin(const XTensor * input, XTensor * output, int dim);
/* /*
get the max value of the items along a dimension of the tensor (return an XTensor structure) get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
*/ */
XTensor ReduceMax(const XTensor &input, int dim); XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */ /*
void ReduceMax(const XTensor &input, XTensor &output, int dim); get the min value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMin(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim) ...@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{ {
CheckNTErrors((input->order > dim), "Illegal dimension specified!"); CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1; int num = input->dimSize[dim];
int num = input->dimSizeRDI[dimRDI];
_ReduceSum(input, output, dim); _ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0); _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
......
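The hunk above keeps the mean as sum-then-scale: with n = input->dimSize[dim], the computed quantity is

    \mathrm{mean}_d(x) = \frac{1}{n} \sum_{i=0}^{n-1} x_{\ldots,i,\ldots}

so only the way n is looked up changes here, not the reduction itself.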
...@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!"); CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!"); CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1; CheckNTErrors(dim < input->order, "Wrong dimension!");
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dim){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
} }
else if(i > dimRDI){ else if(i > dim){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!"); CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
} }
} }
...@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1; int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI]; int strideNum = input->dimSize[dim];
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < input->order; i++) { for (int i = 0; i < input->order; i++) {
if (i < dimRDI) if (i < dim)
stride *= input->dimSizeRDI[i]; blockNum *= input->dimSize[i];
else if (i > dimRDI) else if (i > dim)
blockNum *= input->dimSizeRDI[i]; stride *= input->dimSize[i];
} }
blockSize = stride * strideNum; blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){ if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
int vecBufLength = 32 / sizeof(DTYPE); int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){ if(dim == input->order - 1){
//data is contiguous in dim 0 //data is contiguous in dim 0
for(int i = 0; i < blockNum; i++){ for(int i = 0; i < blockNum; i++){
// stride = 1 // stride = 1
...@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{ } else{
//data is separated //data is separated
for(int i = 0; i < blockNum; i++){ for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){ for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i; DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i; DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......
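As a cross-check on the re-indexed loops above: with blockNum, stride and strideNum computed as in this hunk, a naive row-major reference for the non-vectorized path (ignoring the optional shift and power handling) looks like this sketch, which is not the code in the commit:

    /* naive reduce-sum over `dim`: block k holds blockSize = stride * strideNum
       items, and element (i, j) of a block sits at offset i * stride + j */
    for (int k = 0; k < blockNum; k++) {
        for (int j = 0; j < stride; j++) {
            DTYPE sum = 0;
            DTYPE * ip = (DTYPE*)input->data + k * blockSize + j;
            for (int i = 0; i < strideNum; i++)
                sum += ip[i * stride];
            ((DTYPE*)output->data)[k * stride + j] = sum;
        }
    }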
...@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!"); CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!"); CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dim){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!"); CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
} }
else if(i > dimRDI){ else if(i > dim){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!"); CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
} }
} }
...@@ -709,32 +708,24 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -709,32 +708,24 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int cudaBlockSize[3]; int cudaBlockSize[3];
int iter = 0; int iter = 0;
int stride = 1; int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI]; int strideNum = input->dimSize[dim];
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < input->order; i++) { for (int i = 0; i < input->order; i++) {
if (i < dimRDI) if (i < dim)
stride *= input->dimSizeRDI[i]; blockNum *= input->dimSize[i];
else if (i > dimRDI) else if (i > dim)
blockNum *= input->dimSizeRDI[i]; stride *= input->dimSize[i];
} }
blockSize = stride * strideNum; blockSize = stride * strideNum;
int devID = input->devID; int devID = input->devID;
XMem * mem = input->mem; int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) { if (stride == 1 && blockNum >= 10) {
dim3 grids; dim3 grids;
dim3 blocks; dim3 blocks;
...@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum,sp, power, isExp); strideNum, blockNum,sp, power, isExp);
} }
else { else {
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do { do {
if (input->dataType == DEFAULT_DTYPE) { if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL; DTYPE * iData = NULL;
...@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++; iter++;
} while (strideNum > 1); } while (strideNum > 1);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL) if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize); mem->ReleaseBuf(mem->devID, bufSize);
else else
XMemFree(input->devID, buf); XMemFree(devID, buf);
}
BacktoCudaDev(devID, devIDBackup);
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 ...@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/ */
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean) void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{ {
int dimRDI = input->order - dim - 1; int num = input->dimSize[dim];
int num = input->dimSizeRDI[dimRDI];
_ReduceSum(input, output, dim, mean, 2.0F); _ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0); _ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
} }
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
*/ */
#include "VectorBuffer.h" #include "VectorBuffer.h"
//#include "math.h"
namespace nts { namespace nts {
/* data size for each buffer */ /* data size for each buffer */
int VectorBuffer::size() int VectorBuffer::size()
...@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) { ...@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
return *this; return *this;
} }
/* compute the min of two buffers */
VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
for (int i = 0; i != a.size(); i++) {
this->values[i] = MIN(a[i], this->values[i]);
}
return *this;
}
}/* end of the nts (NiuTrans.Tensor) namespace */ }/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
*/ */
//#include <cstring> //#include <cstring>
#include <math.h>
#include "../../XGlobal.h" #include "../../XGlobal.h"
namespace nts { namespace nts {
...@@ -49,5 +48,8 @@ public: ...@@ -49,5 +48,8 @@ public:
/* compute the max of two buffers */ /* compute the max of two buffers */
VectorBuffer maxData(const VectorBuffer &a); VectorBuffer maxData(const VectorBuffer &a);
/* compute the min of two buffers */
VectorBuffer minData(const VectorBuffer &a);
}; };
} }
\ No newline at end of file
...@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim) ...@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!"); CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0; int catDimSize = 0;
int dimRDI = big->order - dim - 1;
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!"); CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
for (int j = 0; j < big->order; j++) { for (int j = 0; j < big->order; j++) {
if (j != dimRDI) { if (j != dim) {
CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!"); CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
} }
else { else {
catDimSize += tensor->dimSizeRDI[j]; catDimSize += tensor->dimSize[j];
} }
} }
} }
CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!"); CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");
int stride = 1; int stride = 1;
for (int i = 0; i < dimRDI; i++)
stride *= big->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < big->order; i++) for (int i = 0; i < dim; i++)
blockNum *= big->dimSizeRDI[i]; blockNum *= big->dimSize[i];
for (int i = dim + 1; i < big->order; i++)
stride *= big->dimSize[i];
int offset = 0; int offset = 0;
...@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim) ...@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
if (smalls->count <= MIN_TENSOR_CAT_NUM) { if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize; int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize; int tPitch = stride * big->dimSize[dim] * big->unitSize;
int mSize = sPitch; int mSize = sPitch;
int n = blockNum; int n = blockNum;
XMemCopy2D((char*)big->data + offset, tPitch, big->devID, XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
...@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim) ...@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
int * blockSizes = new int[smalls->count]; int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize; blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
sourceArrays->Add((char*)tensor->data); sourceArrays->Add((char*)tensor->data);
} }
......
...@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b) ...@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b)
return false; return false;
for(int i = 0; i < a->order; i++){ for(int i = 0; i < a->order; i++){
if(a->dimSizeRDI[i] != b->dimSizeRDI[i]) if(a->dimSize[i] != b->dimSize[i])
return false; return false;
} }
......
...@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if(leadingDim < 0) if(leadingDim < 0)
leadingDim = 0; leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1; if (leadingDim >= s->order)
int leadingDimRDI = s->order - leadingDim - 1; leadingDim = leadingDim - s->order;
if (leadingDimRDI < 0)
leadingDimRDI = s->order - 1;
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!"); CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!"); CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) { if (i == whereToMerge) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i < leadingDimRDI){ else if (i < leadingDim){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i > leadingDimRDI) { else if (i > leadingDim) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]), CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int blockNum = 1; int blockNum = 1;
int gridSize = 1; int gridSize = 1;
int gridNum = 1; int gridNum = 1;
int mergedNum = s->dimSizeRDI[leadingDimRDI]; int mergedNum = s->dimSize[leadingDim];
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i <= leadingDimRDI) { if (i >= leadingDim) {
if (i <= whereToMergeRDI) if (i >= whereToMerge)
blockSize *= s->dimSizeRDI[i]; blockSize *= s->dimSize[i];
else else
blockNum *= s->dimSizeRDI[i]; blockNum *= s->dimSize[i];
} }
} }
...@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if (!isOnSameDevice) if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
int blockNumInMerge = s->dimSizeRDI[leadingDimRDI]; int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge; int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
...@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge) ...@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
int mergedNum = smalls->count; int mergedNum = smalls->count;
XTensor * s0 = smalls->GetItem(0); XTensor * s0 = smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) { for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI) if (i >= whereToMerge)
blockSize *= s0->dimSizeRDI[i]; blockSize *= s0->dimSize[i];
else else
blockNum *= s0->dimSizeRDI[i]; blockNum *= s0->dimSize[i];
} }
CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!"); CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
......
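With direct indexing, blockNum now covers the dimensions in front of whereToMerge and blockSize the rest. An illustrative case: merging two (2, 4) tensors at whereToMerge = 1 gives blockNum = 2 and blockSize = 4 per input, and a (2, 8) result whose row i is the concatenation of row i of each input:

    /* illustrative: smallA and smallB are (2, 4); t comes out (2, 8) */
    XTensor t = Merge(smallA, smallB, 1);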
...@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge); ...@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
/* merge two tensors into a big tensor (return an XTensor structure) */ /* merge two tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge); XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);
void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__ #endif // __MERGE_H__
\ No newline at end of file
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
>> s - the source tensor >> s - the source tensor
>> t - the target tensor (for return) >> t - the target tensor (for return)
...@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!"); CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");
int whereToSplitRDI = s->order - whereToSplit - 1;
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum), CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else { else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
/* for the case that we split the last dimension. Actually /* for the case that we split the first dimension. Actually
(N, M) and (3, N/3, M) have the same memory layout */ (N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) { if (0 == whereToSplit) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize); XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return; return;
} }
...@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
blockSize *= s->dimSizeRDI[i] / splitNum; blockSize *= s->dimSize[i] / splitNum;
blockNum *= splitNum; blockNum *= splitNum;
} }
else if (i < whereToSplitRDI) else if (i > whereToSplit)
blockSize *= s->dimSizeRDI[i]; blockSize *= s->dimSize[i];
else else
blockNum *= s->dimSizeRDI[i]; blockNum *= s->dimSize[i];
} }
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!"); CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
...@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int ...@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
} }
/* /*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure) transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - the source tensor >> s - the source tensor
...@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli ...@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!"); CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
CheckNTErrors((smalls->count > 0), "Wrong input!"); CheckNTErrors((smalls->count > 0), "Wrong input!");
int whereToSplitRDI = big->order - whereToSplit - 1;
bool uniform = true; bool uniform = true;
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
...@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli ...@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < big->order; i++) { for (int i = 0; i < big->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
blockSize *= big->dimSizeRDI[i] / splitNum; blockSize *= big->dimSize[i] / splitNum;
blockNum *= splitNum; blockNum *= splitNum;
} }
else if (i < whereToSplitRDI) else if (i > whereToSplit)
blockSize *= big->dimSizeRDI[i]; blockSize *= big->dimSize[i];
else else
blockNum *= big->dimSizeRDI[i]; blockNum *= big->dimSize[i];
} }
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!"); CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......
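The same decomposition drives the list version of _Split above. An illustrative case: splitting a (6, 4) tensor with whereToSplit = 0 and splitNum = 3 gives blockSize = (6 / 3) * 4 = 8 and blockNum = 3, i.e. three (2, 4) sub-tensors, each a contiguous block of 8 items.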
...@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!"); CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!"); CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) { for (int i = 0; i < b->order; i++) {
if (i < dimRDI) { if (i < dim) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
} }
else if (i > dimRDI) { else if (i > dim) {
CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
} }
else { else {
CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
} }
} }
...@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockNumA = 1; int blockNumA = 1;
int blockNumB = 1; int blockNumB = 1;
for (int i = 0; i < dimRDI; i++) for (int i = dim; i < a->order; i++)
blockSize *= a->dimSizeRDI[i]; blockSize *= a->dimSize[i];
realBlockSize = blockSize * a->unitSize; realBlockSize = blockSize * a->unitSize;
......
...@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1; int blockSize = 1;
int blockNumA = 1; int blockNumA = 1;
int blockNumB = 1; int blockNumB = 1;
int dimRDI = b->order - dim - 1; for (int i = dim; i < a->order; i++)
for (int i = 0; i < dimRDI; i++) blockSize *= a->dimSize[i];
blockSize *= a->dimSizeRDI[i];
blockNumA = a->unitNum / blockSize; blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize; blockNumB = b->unitNum / blockSize;
...@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (dimRDI == 0) { if (dim == b->order - 1) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
......
...@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors((a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */ /* make the index tensor */
SetAscendingOrder(*index, dim); SetAscendingOrder(*index, dim);
...@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
} }
else { else {
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSize = stride * strideNum; int blockSize = stride * strideNum;
_CopyValues(a, b); _CopyValues(a, b);
......
...@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in ...@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!"); CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1; if (k < 0 || k > b->dimSize[dim])
if (k < 0 || k > b->dimSizeRDI[dimRDI]) k = b->dimSize[dim];
k = b->dimSizeRDI[dimRDI];
XMem * mem = a->mem; XMem * mem = a->mem;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum); int m = GetNextPower2(strideNum);
int n = stride * blockNum; int n = stride * blockNum;
......
...@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!"); CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!");
CheckNTErrors(index->dataType == X_INT, "Wrong data type!"); CheckNTErrors(index->dataType == X_INT, "Wrong data type!");
int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) { for (int i = 0; i < a->order; i++) {
if (i == dimRDI) { if (i == dim) {
CheckNTErrors(b->dimSizeRDI[i] == k, "A too large K"); CheckNTErrors((b->dimSize[i] == k), "A too large K");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == k, "Wrong size!"); CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
} }
else { else {
CheckNTErrors(b->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!"); CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!"); CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
} }
} }
...@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1; int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
int strideNumB = b->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNumA = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; int strideNumB = b->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSizeA = stride * strideNumA; int blockSizeA = stride * strideNumA;
int blockSizeB = stride * strideNumB; int blockSizeB = stride * strideNumB;
......
...@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "A too large K"); CheckNTErrors((b->dimSize[dim] == k), "A too large K");
int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNumA = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32; int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */ /* adjust the thread num according size of k for fitting the share memory size */
......
...@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim) ...@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim)
return; return;
} }
int dimRDI = tensor.order - dim - 1;
if(tensor.devID >= 0){ if(tensor.devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
CudaSetAscendingOrder(&tensor, dim); CudaSetAscendingOrder(&tensor, dim);
...@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim) ...@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim)
} }
else{ else{
int stride = 1; int stride = 1;
int strideNum = tensor.dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= tensor.dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for(int i = dimRDI + 1; i < tensor.order; i++) int strideNum = tensor.dimSize[dim];
blockNum *= tensor.dimSizeRDI[i]; for(int i = 0; i < dim; i++)
blockNum *= tensor.dimSize[i];
for(int i = dim + 1; i < tensor.order; i++)
stride *= tensor.dimSize[i];
for(int k = 0; k < blockNum; k++){ for(int k = 0; k < blockNum; k++){
for(int j = 0; j < strideNum; j++){ for(int j = 0; j < strideNum; j++){
......
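A small worked example of the CPU loop above (illustrative shape): for a (2, 3) X_INT tensor with dim = 1, we get blockNum = 2, strideNum = 3 and stride = 1, so SetAscendingOrder fills each row with {0, 1, 2}.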
...@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim) ...@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{ {
CheckNTErrors((a->dataType == X_INT), "TODO!"); CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
......
...@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return; return;
} }
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse && if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{ {
...@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * blockMax = NULL; XTensor * blockMax = NULL;
XTensor * blockSum = NULL; XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
_ReduceSum(x, sum, leadDim, max, 1.0F, true); _ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) { if (x->devID >= 0) {
if(leadDimRDI == 0){ if(leadDim == x->order - 1){
blockSize = y->unitNum; blockSize = y->unitNum;
blockNum = 1; blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem); blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
...@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax->data = mp; blockMax->data = mp;
blockSum->data = sp; blockSum->data = sp;
#ifdef USE_CUDA #ifdef USE_CUDA
if(leadDimRDI == 0) if(leadDim == x->order - 1)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax); _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax); _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
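For reference, the blockMax/blockSum pipeline above computes the numerically stable form

    \log \mathrm{softmax}(x)_i = x_i - m - \log \sum_j e^{x_j - m}, \qquad m = \max_j x_j

which is why _ReduceMax and the max-shifted, exponentiated _ReduceSum are taken along leadDim before the elementwise step.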
...@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0) if(leadDim < 0)
leadDim = y->order - 1; leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA #ifdef USE_CUDA
if (gold->devID >= 0) { if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName); _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
...@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
} }
#endif #endif
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i); int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i); DTYPE value = gold->GetInSparse(i);
int offset = key; int offset = key;
if (dedx->dimSizeRDI[0] != gm) { if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm; int mi = key % gm;
int ni = key / gm; int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi; int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2; offset = key2;
} }
if (key >= 0 && key < size) if (key >= 0 && key < size)
...@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i); int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i); DTYPE value = gold->GetInSparse(i);
int offset = key; int offset = key;
if (dedx->dimSizeRDI[0] != gm) { if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm; int mi = key % gm;
int ni = key / gm; int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi; int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2; offset = key2;
} }
if (key >= 0 && key < size) if (key >= 0 && key < size)
...@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no xs we set dE/ds = 0 */ /* for columns with no xs we set dE/ds = 0 */
if (gold != NULL && gold->isSparse) { if (gold != NULL && gold->isSparse) {
CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!"); CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) { if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
int gn = gold->dimSize[0]; int gn = gold->dimSize[0];
int gm = gold->dimSize[1]; int gm = gold->dimSize[1];
int sm = dedx->dimSizeRDI[0]; int sm = dedx->dimSize[dedx->order - 1];
int sn = dedx->dimSizeRDI[1]; int sn = dedx->dimSize[dedx->order - 2];
int * flags = new int[sm]; int * flags = new int[sm];
memset(flags, 0, sizeof(int)*sm); memset(flags, 0, sizeof(int)*sm);
......
...@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
"Tensors used in log softmax are not on the same GPU."); "Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!"); CheckNTErrors((gold != NULL), "No x gold standard is found!");
int leadDimRDI = y->order - leadDim - 1; int dimensionSize = y->dimSize[leadDim];
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, ...@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
if (output->devID < 0) { if (output->devID < 0) {
CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!"); CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!"); CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1; int dimensionSize = output->dimSize[leadDim];
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSizeRDI[i]; stride *= output->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize; blockNum = output->unitNum / blockSize;
...@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, ...@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
{ {
CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!"); CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!"); CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!"); CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!"); CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = output->order - leadDim - 1; int dimensionSize = output->dimSize[leadDim];
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSizeRDI[i]; stride *= output->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize; blockNum = output->unitNum / blockSize;
...@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors(t->order > leadDim, "Illegal leading dimension!"); CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1; if (leadDim < 0) {
if(leadDimRDI < 0){ leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0; tBeg = 0;
yBeg = 0; yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI]; tLen = y->dimSize[leadDim];
} }
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{ {
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!"); CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!"); CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && y->dimSize[y->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!"); CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
...@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU."); "The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!"); CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1; if (leadDim < 0) {
if(leadDimRDI < 0){ leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0; tBeg = 0;
yBeg = 0; yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI]; tLen = y->dimSize[leadDim];
} }
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
int size = 1; int size = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
size = tLen * stride; size = tLen * stride;
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...
@@ -41,7 +41,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if(leadDim < 0)
leadDim = x->order - 1;
-int leadDimRDI = x->order - leadDim - 1;
if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
@@ -71,13 +70,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
-int dimensionSize = y->dimSizeRDI[leadDimRDI];
+int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
-for(int i = 0; i < leadDimRDI; i++)
-stride *= y->dimSizeRDI[i];
+for(int i = leadDim + 1; i < y->order; i++)
+stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
@@ -207,8 +206,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
-int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
@@ -216,12 +213,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
-int dimensionSize = y->dimSizeRDI[leadDimRDI];
+int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
-for(int i = 0; i < leadDimRDI; i++)
-stride *= y->dimSizeRDI[i];
+for(int i = leadDim + 1; i < y->order; i++)
+stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
...
@@ -226,14 +226,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((_IsSameShaped(x, y)), "Input tensors must be of the same size!");
-int leadDimRDI = y->order - leadDim - 1;
-int dimensionSize = y->dimSizeRDI[leadDimRDI];
+int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
-for(int i = 0; i < leadDimRDI; i++)
-stride *= y->dimSizeRDI[i];
+for(int i = leadDim + 1; i < y->order; i++)
+stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
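Note: a worked example of the block decomposition used here and in the softmax hunks above, for an assumed 2 x 3 x 4 tensor with leadDim = 1:

    #include <cassert>

    // Illustrative only: dimension sizes and leadDim are assumptions,
    // chosen to make the block arithmetic easy to check by hand.
    int main() {
        int dimSize[] = {2, 3, 4};
        int order = 3, leadDim = 1, unitNum = 2 * 3 * 4;

        int dimensionSize = dimSize[leadDim];      // 3
        int stride = 1;
        for (int i = leadDim + 1; i < order; i++)
            stride *= dimSize[i];                  // 4
        int blockSize = stride * dimensionSize;    // 12
        int blockNum = unitNum / blockSize;        // 2

        assert(dimensionSize == 3 && stride == 4);
        assert(blockSize == 12 && blockNum == 2);
        return 0;
    }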
...