Commit c9ef15f8 by 张裕浩

Remove XTensor RDI representation.

parent 1f4eecdd
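Note: for context on what "RDI" was. Under Reversed Dimension Indexing, dimension dim of a tensor was mirrored at slot order - dim - 1 of a second array, dimSizeRDI, and every kernel translated indices before computing strides; this commit drops the mirror and indexes dimSize directly, which is why the stride/blockNum loops throughout the diff flip direction. A minimal standalone sketch of the equivalence (illustrative only; the arrays mirror XTensor's order/dimSize fields, everything else is hypothetical):

    #include <cassert>

    int main()
    {
        const int order = 3;
        int dimSize[3]    = {4, 5, 6};   // direct indexing (this commit)
        int dimSizeRDI[3] = {6, 5, 4};   // the removed reversed copy

        for (int i = 0; i < order; i++)
            assert(dimSizeRDI[order - i - 1] == dimSize[i]);

        // Stride of dimension dim in a row-major layout: the product of
        // the faster-varying (trailing) dimensions. Under RDI those were
        // the slots *below* dimRDI; with direct indexing they are the
        // slots *after* dim - hence the flipped loop bounds below.
        int dim = 1;
        int dimRDI = order - dim - 1;

        int strideRDI = 1;
        for (int i = 0; i < dimRDI; i++)
            strideRDI *= dimSizeRDI[i];

        int stride = 1;
        for (int i = dim + 1; i < order; i++)
            stride *= dimSize[i];

        assert(stride == strideRDI);     // both equal 6
        return 0;
    }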
@@ -233,7 +233,6 @@ void XTensor::Init()
     devID = -1;
     order = -1;
     memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
     dataType = DEFAULT_DTYPE;
     unitSize = sizeof(float);
     unitNum = 0;
@@ -278,7 +277,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 {
     order = tensor.order;
     memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
     dataType = tensor.dataType;
     unitSize = tensor.unitSize;
     unitNum = tensor.unitNum;
@@ -442,7 +440,7 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
         return false;
     for(int i = 0; i < a->order; i++){
-        if(a->dimSizeRDI[i] != b->dimSizeRDI[i])
+        if(a->dimSize[i] != b->dimSize[i])
             return false;
     }
@@ -478,7 +476,6 @@ void XTensor::SetDim(int * myDimSize)
 {
     for (int i = 0; i < order; i++) {
         dimSize[i] = myDimSize[i];
-        dimSizeRDI[order - i - 1] = myDimSize[i];
     }
 }
@@ -505,20 +502,17 @@ reshape the tensor
 void XTensor::Reshape(const int myOrder, const int * myDimSize)
 {
     int dims[MAX_TENSOR_DIM_NUM];
-    int dimsRDI[MAX_TENSOR_DIM_NUM];
     int num = 1;
     for(int i = 0; i < myOrder; i++){
         num *= myDimSize[i];
         dims[i] = abs(myDimSize[i]);
-        dimsRDI[myOrder - i - 1] = dims[i];
     }
     CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
     order = myOrder;
     memcpy(dimSize, dims, sizeof(int) * order);
-    memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
 }
 /*
@@ -888,7 +882,6 @@ void XTensor::SetAscendingOrder(int dim)
     CheckNTErrors((dim >= 0 && dim < order), "Wrong dimension specified!");
     CheckNTErrors((dataType == X_INT), "TODO!");
-    int dimRDI = order - dim - 1;
     if(devID >= 0){
 #ifdef USE_CUDA
         CudaSetAscendingOrder(this, dim);
@@ -898,13 +891,13 @@ void XTensor::SetAscendingOrder(int dim)
     }
     else{
         int stride = 1;
-        int strideNum = dimSizeRDI[dimRDI];
-        for(int i = 0; i < dimRDI; i++)
-            stride *= dimSizeRDI[i];
         int blockNum = 1;
-        for(int i = dimRDI + 1; i < order; i++)
-            blockNum *= dimSizeRDI[i];
+        int strideNum = dimSize[dim];
+        for(int i = 0; i < dim; i++)
+            blockNum *= dimSize[i];
+        for(int i = dim + 1; i < order; i++)
+            stride *= dimSize[i];
         for(int k = 0; k < blockNum; k++){
             for(int j = 0; j < strideNum; j++){
@@ -939,17 +932,13 @@ void * XTensor::GetCell(int index[], int size) const
 {
     CheckNTErrors((size == order), "Illegal index!");
-    int * indexRDI = new int[size];
-    for (int i = 0; i < size; i++)
-        indexRDI[size - i - 1] = index[i];
-    int offset = indexRDI[size - 1];
-    for(int i = size - 2; i >= 0; i--){
-        CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
-        offset = offset * dimSizeRDI[i] + indexRDI[i];
+    int offset = index[0];
+    for(int i = 1; i < size; ++i){
+        CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
+        offset = offset * dimSize[i] + index[i];
     }
-    delete[] indexRDI;
     if(isSparse){
         DTYPE value;
@@ -1365,7 +1354,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
     bool zeroData = false;
     for(int i = 0; i < order; i++){
         dimSize[i] = abs(myDimSize[i]);
-        dimSizeRDI[order - i - 1] = dimSize[i];
         if(myDimSize[i] < 0)
             filledData = false;
         if(myDimSize[i] == 0)
@@ -1564,7 +1552,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
     if (isSparse) {
         int num = 0;
         for (int i = 0; i < order; i++)
-            num *= dimSizeRDI[i];
+            num *= dimSize[i];
         num = int(num * denseRatio + 1);
         int tupleSize = sizeof(int) + sizeof(DTYPE);
         int size = sizeof(int) + tupleSize*(num);
@@ -1756,8 +1744,8 @@ void XTensor::Read(FILE * file, const char * label)
         int ds[MAX_TENSOR_DIM_NUM];
         for (int i = 0; i < order; i++) {
-            ds[i] = key % dimSizeRDI[i];
-            key /= dimSizeRDI[i];
+            ds[i] = key % dimSize[i];
+            key /= dimSize[i];
         }
         Set(value, ds);
     }
...
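Note: the GetCell change above is the clearest instance of the new addressing: instead of building a reversed index array on the heap, the offset is the usual row-major linearization computed front to back. A quick sketch (standalone; sizes and indices are hypothetical):

    #include <cassert>

    // row-major linearization: offset = ((i0*d1 + i1)*d2 + i2)...
    int main()
    {
        const int order = 3;
        int dimSize[3] = {2, 3, 4};
        int index[3]   = {1, 2, 3};      // points at the last element

        int offset = index[0];
        for (int i = 1; i < order; ++i)
            offset = offset * dimSize[i] + index[i];

        assert(offset == 1 * 3 * 4 + 2 * 4 + 3);   // == 23, the last cell
        return 0;
    }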
@@ -95,8 +95,6 @@ public:
     /* size of each dimension */
     int dimSize[MAX_TENSOR_DIM_NUM];
-    /* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
-    int dimSizeRDI[MAX_TENSOR_DIM_NUM];
     /* data unit - data type for every cell */
     TENSOR_DATA_TYPE dataType;
...
@@ -41,7 +41,6 @@ where i is the index of the item
 */
 void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order),
@@ -59,17 +58,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
@@ -122,7 +122,6 @@ where i is the item index
 */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
         return;
     }
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     int bBlockNum = 1;
     int cBlockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
-        aBlockNum *= a->dimSizeRDI[i];
-        cBlockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
+        aBlockNum *= a->dimSize[i];
+        cBlockNum *= a->dimSize[i];
     }
-    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
-        bBlockNum *= b->dimSizeRDI[i];
-        cBlockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++) {
+        CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
+        bBlockNum *= b->dimSize[i];
+        cBlockNum *= b->dimSize[i];
     }
     XList * aList = new XList(10);
     XList * bList = new XList(10);
     XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
     bool isSparseMul = false;
@@ -230,20 +231,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -280,20 +281,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
...
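Note: with RDI the matrix rows and columns of a high-order tensor lived in slots 1 and 0; with direct indexing they are the last two dimensions, dimSize[order - 2] x dimSize[order - 1], and the batch dimensions now come first in the result shape. A sketch of the rewritten shape bookkeeping (illustrative, non-transposed case; all names hypothetical):

    #include <cassert>

    int main()
    {
        const int aOrder = 3, bOrder = 3;
        int aDim[3] = {8, 4, 5};          // batch=8, an=4, am=5
        int bDim[3] = {8, 5, 6};          // batch=8, bn=5, bm=6

        int an = aDim[aOrder - 2], am = aDim[aOrder - 1];
        int bn = bDim[bOrder - 2], bm = bDim[bOrder - 1];
        assert(am == bn);                 // inner dimensions must agree

        // result shape: a's batch dims, then b's, then an x bm
        int order = aOrder + bOrder - 2;
        int dimSize[4];
        int sub = 0;
        for (int i = 0; i < aOrder - 2; i++) dimSize[sub++] = aDim[i];
        for (int i = 0; i < bOrder - 2; i++) dimSize[sub++] = bDim[i];
        dimSize[sub++] = an;
        dimSize[sub++] = bm;

        assert(order == 4 && dimSize[0] == 8 && dimSize[1] == 8 &&
               dimSize[2] == 4 && dimSize[3] == 6);
        return 0;
    }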
@@ -56,7 +56,6 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensors must have a order >= 2!");
     CheckNTErrors((a->order == b->order && a->order == c->order),
                   "Input tensor and output tensor must have same order!");
-
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0)
         _MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
     else
@@ -94,27 +93,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
     CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }
     int devIDBackup = 0;
@@ -125,9 +124,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                         a->data, transposedA, a->dataType, aBlockSize,
                         b->data, transposedB, b->dataType, bBlockSize,
                         c->data, c->dataType, cBlockSize, blockNum,
-                        a->dimSizeRDI[1], a->dimSizeRDI[0],
-                        b->dimSizeRDI[1], b->dimSizeRDI[0],
-                        c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
+                        a->dimSize[a->order - 2], a->dimSize[a->order - 1],
+                        b->dimSize[b->order - 2], b->dimSize[b->order - 1],
+                        c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
     BacktoCudaDev(a->devID, devIDBackup);
 #endif
@@ -163,32 +162,32 @@ CheckNTErrors((a && b && c), "Empty input tensors!");
                   "Input tensor and output tensor must have same order!");
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }
-    int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
-    int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
-    int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
+    int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
+    int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
+    int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};
     XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
     XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
@@ -230,7 +229,6 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
 {
     CheckNTErrors(a && b && c, "Empty input lists!");
     CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
-
     if (a->count == 0)
         return;
@@ -291,10 +289,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
@@ -347,10 +345,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
...
@@ -41,7 +41,6 @@ where i is the index of the item
 */
 void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order),
@@ -59,18 +58,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
@@ -122,7 +122,6 @@ where i is the item index
 */
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
@@ -130,18 +129,18 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
@@ -41,14 +41,14 @@ void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE bet
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
     CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
+    CheckNTErrors((b->order == 2 && b->dimSize[b->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
                   "Illegal input vector size!");
     int rowNum = a->dimSize[0];
     int colNum = a->dimSize[1];
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++)
+        blockNum *= a->dimSize[i];
     int blockSize = colNum * rowNum;
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
...
@@ -68,7 +68,7 @@ void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
     CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
+    CheckNTErrors((b->order == 2 && b->dimSize[b->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
                   "Illegal input vector size!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
                    c->dataType == DEFAULT_DTYPE), "TODO");
@@ -76,8 +76,8 @@ void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
     int rowNum = a->dimSize[0];
     int colNum = a->dimSize[1];
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++)
+        blockNum *= a->dimSize[i];
     int cudaGridSize[3];
     int cudaBlockSize[3];
...
@@ -41,7 +41,7 @@ void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE bet
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
     CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
+    CheckNTErrors((a->order == 2 && a->dimSize[a->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
                   "Illegal input vector size!");
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
@@ -53,8 +53,8 @@ void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE bet
     int rowNum = b->dimSize[0];
     int colNum = b->dimSize[1];
     int blockNum = 1;
-    for (int i = 2; i < b->order; i++)
-        blockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++)
+        blockNum *= b->dimSize[i];
     int blockSize = colNum * rowNum;
     if (!a->isSparse && !b->isSparse) {
...
@@ -84,7 +84,7 @@ void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
     CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
+    CheckNTErrors((a->order == 2 && a->dimSize[a->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
                   "Illegal input vector size!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
                    c->dataType == DEFAULT_DTYPE), "TODO");
@@ -92,13 +92,13 @@ void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
     int rowNum = b->dimSize[0];
     int colNum = b->dimSize[1];
     int blockNum = 1;
-    for (int i = 2; i < b->order; i++)
-        blockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++)
+        blockNum *= b->dimSize[i];
     int cudaGridSize[3];
     int cudaBlockSize[3];
-    GDevs.GetCudaThread(c->devID, a->dimSizeRDI[1], cudaGridSize, cudaBlockSize);
+    GDevs.GetCudaThread(c->devID, a->dimSize[a->order - 2], cudaGridSize, cudaBlockSize);
     int devIDBackup = 0;
     ProtectCudaDev(a->devID, devIDBackup);
...
@@ -44,26 +44,25 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 */
 void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
-    int dimRDI = input->order - dim - 1;
     CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
     CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
     CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
-    CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
+    CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
-            stride *= input->dimSizeRDI[i];
+        if (i < dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
+            blockNum *= input->dimSize[i];
         }
-        else if (i > dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
-            blockNum *= input->dimSizeRDI[i];
+        else if (i > dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
+            stride *= input->dimSize[i];
         }
     }
     blockSize = stride * strideNum;
...
@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
-    int dimRDI = input->order - dim - 1;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i > dim)
+            stride *= input->dimSize[i];
+        else if (i < dim)
+            blockNum *= input->dimSize[i];
     }
     int cudaGridSize[3];
...
@@ -40,12 +40,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
 {
     CheckNTErrors((XTensor::IsSameShaped(s, t)), "Unmatched tensors!");
-    int blockDimRDI = s->order - blockDim - 1;
     int blockSize = 1;
     int blockNum = blockNumInGrid;
     int gridNum = 1;
-    for (int i = 0; i < blockDimRDI; i++)
-        blockSize *= s->dimSizeRDI[i];
+    for (int i = blockDim; i < s->order; i++)
+        blockSize *= s->dimSize[i];
     CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
     gridNum = s->unitNum / (blockSize * blockNum);
...
@@ -52,26 +52,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
     CheckNTErrors((dim < s->order && dim < t->order), "A too larget dimension specified!");
     CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
-    int dimRDI = s->order - dim - 1;
     int blockSizeSrc = 1;
     int blockSizeTgt = 1;
     int blockNumSrc = 1;
     int blockNumTgt = 1;
-    int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
-    int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
+    int leadDimSizeSrc = s->dimSize[dim];
+    int leadDimSizeTgt = t->dimSize[dim];
     int indexOffsetNum = 1;
-    for (int i = 0; i < dimRDI; i++) {
-        blockSizeSrc *= s->dimSizeRDI[i];
-        blockSizeTgt *= t->dimSizeRDI[i];
+    for (int i = dim + 1; i < s->order; i++) {
+        blockSizeSrc *= s->dimSize[i];
+    }
+    for (int i = dim + 1; i < t->order; i++) {
+        blockSizeTgt *= t->dimSize[i];
+    }
+    for (int i = 0; i <= dim; i++)
+    {
+        blockNumSrc *= s->dimSize[i];
+        blockNumTgt *= t->dimSize[i];
     }
-    for (int i = dimRDI; i < s->order; i++)
-        blockNumSrc *= s->dimSizeRDI[i];
-    for (int i = dimRDI; i < t->order; i++)
-        blockNumTgt *= t->dimSizeRDI[i];
     CheckNTErrors((blockSizeSrc == blockSizeTgt), "Unmatched tensors!");
-    indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
+    indexOffsetNum = blockNumSrc / s->dimSize[dim];
     int realIndexSize = indexOffsetNum * indexSize * copyNum;
     int * realSrcIndex = new int[realIndexSize];
...
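Note: _CopyIndexed splits the tensor differently from the reduction kernels: blockNum runs over the leading dimensions including dim itself, blockSize over the trailing ones, so dividing blockNum by dimSize[dim] recovers the number of purely-leading blocks. A sketch (illustrative; standalone names):

    #include <cassert>

    int main()
    {
        const int order = 3;
        int dimSize[3] = {2, 3, 4};
        int dim = 1;

        int blockSize = 1, blockNum = 1;
        for (int i = dim + 1; i < order; i++)
            blockSize *= dimSize[i];       // 4
        for (int i = 0; i <= dim; i++)
            blockNum *= dimSize[i];        // 2 * 3 = 6

        int indexOffsetNum = blockNum / dimSize[dim];
        assert(blockSize == 4 && blockNum == 6 && indexOffsetNum == 2);
        return 0;
    }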
@@ -160,16 +160,14 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
     int devID = srcIndex->devID;
     XMem * mem = s->mem;
-    int dimRDI = srcIndex->order - dim - 1;
     int stride = 1;
-    int indexSize = srcIndex->unitNum;
-    int strideNum = srcIndex->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= srcIndex->dimSizeRDI[i];
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < srcIndex->order; i++)
-        blockNum *= srcIndex->dimSizeRDI[i];
+    int indexSize = srcIndex->unitNum;
+    int strideNum = srcIndex->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= srcIndex->dimSize[i];
+    for (int i = dim + 1; i < srcIndex->order; i++)
+        stride *= srcIndex->dimSize[i];
     int * sIndex = NULL;
     if (srcIndex->devID < 0) {
...
@@ -44,16 +44,15 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
     CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
     CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
-    int dimRDI = input->order - dim - 1;
-    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
+    CheckNTErrors(dim < input->order, "Wrong dimension!");
     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
+        if(i < dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]),
                           "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
+        else if(i > dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]),
                           "Unmatched tensors!");
         }
     }
@@ -67,31 +66,31 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i > dim)
+            stride *= input->dimSize[i];
+        else if (i < dim)
+            blockNum *= input->dimSize[i];
     }
     blockSize = stride * strideNum;
-    if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
         int vecBufLength = 32 / sizeof(DTYPE);
-        if(dimRDI == 0){
+        if (dim == input->order - 1) {
             //data is contiguous in dim 0
-            for(int i = 0; i < blockNum; i++){
+            for (int i = 0; i < blockNum; i++) {
                 DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                 DTYPE * op = (DTYPE*)output->data + i;
                 VectorBuffer vecBuf[4];
-                for(int j = 0; j < 4; j++){
-                    vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength);
+                for (int j = 0; j < 4; j++) {
+                    vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength);
                 }
-                for(int j = 1; j < strideNum / 32; j++){
+                for (int j = 1; j < strideNum / 32; j++) {
                     const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
                     vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
                     vecBuf[1] = vecBuf[1].maxData(VectorBuffer::loadu(ptr + 1 * vecBufLength));
@@ -102,16 +101,17 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
                 vecBuf[0] = vecBuf[0].maxData(vecBuf[2]);
                 vecBuf[0] = vecBuf[0].maxData(vecBuf[3]);
                 DTYPE maxN = DTYPE_MIN;
-                for(int k = 0; k < vecBufLength; k++){
-                    maxN = MAX(maxN,vecBuf[0][k]);
+                for (int k = 0; k < vecBufLength; k++) {
+                    maxN = MAX(maxN, vecBuf[0][k]);
                 }
                 *op = maxN;
             }
-        } else{
+        }
+        else {
             //data is separated
             for(int i = 0; i < blockNum; i++){
-                for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
+                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
                     DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                     DTYPE * op = (DTYPE*)output->data + stride * i;
                     VectorBuffer vecBuf[4];
...
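Note: all of the reduction kernels (max, sum, mean, variance) view the input as blockNum x strideNum x stride, where strideNum is the size of the reduced dimension; under direct indexing blockNum collects the dimensions before dim and stride the ones after it, the mirror image of the old RDI loops. A worked sketch of the decomposition (illustrative; standalone names):

    #include <cassert>

    int main()
    {
        const int order = 4;
        int dimSize[4] = {2, 3, 4, 5};
        int dim = 2;                       // reduce over the size-4 axis

        int stride = 1, strideNum = dimSize[dim], blockNum = 1;
        for (int i = 0; i < order; i++) {
            if (i < dim)
                blockNum *= dimSize[i];    // slower-varying dims: 2*3
            else if (i > dim)
                stride *= dimSize[i];      // faster-varying dims: 5
        }

        assert(blockNum == 6 && strideNum == 4 && stride == 5);
        // element (b, j, s) lives at b * (strideNum * stride) + j * stride + s
        return 0;
    }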
@@ -504,13 +504,12 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
     CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
     CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
-    int dimRDI = input->order - dim - 1;
     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
        }
     }
@@ -518,15 +517,15 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
     int cudaBlockSize[3];
     int iter = 0;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;
...
@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
 {
     CheckNTErrors((input->order > dim), "Illegal dimension specified!");
-    int dimRDI = input->order - dim - 1;
-    int num = input->dimSizeRDI[dimRDI];
+    int num = input->dimSize[dim];
     _ReduceSum(input, output, dim);
     _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
...
@@ -53,15 +53,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
     CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
-    int dimRDI = input->order - dim - 1;
-    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
+    CheckNTErrors(dim < input->order, "Wrong dimension!");
     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
         }
     }
@@ -74,21 +73,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;
-    if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
         int vecBufLength = 32 / sizeof(DTYPE);
-        if(dimRDI == 0){
+        if(dim == input->order - 1){
             //data is contiguous in dim 0
             for(int i = 0; i < blockNum; i++){
                 // stride = 1
@@ -122,7 +121,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     } else{
         //data is separated
         for(int i = 0; i < blockNum; i++){
-            for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
+            for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
                 DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                 DTYPE * op = (DTYPE*)output->data + stride * i;
                 DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
...
@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
     CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
-    int dimRDI = input->order - dim - 1;
     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
         }
     }
@@ -709,15 +708,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     int cudaBlockSize[3];
     int iter = 0;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;
...
@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
 */
 void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
 {
-    int dimRDI = input->order - dim - 1;
-    int num = input->dimSizeRDI[dimRDI];
+    int num = input->dimSize[dim];
     _ReduceSum(input, output, dim, mean, 2.0F);
     _ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
 }
...
@@ -20,7 +20,7 @@
 */
 #include "VectorBuffer.h"
-
+#include "math.h"
 namespace nts {
 /* data size for each buffer */
 int VectorBuffer::size()
...
...@@ -39,30 +39,29 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim) ...@@ -39,30 +39,29 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!"); CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0; int catDimSize = 0;
int dimRDI = big->order - dim - 1;
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!"); CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
for (int j = 0; j < big->order; j++) { for (int j = 0; j < big->order; j++) {
if (j != dimRDI) { if (j != dim) {
CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!"); CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
} }
else { else {
catDimSize += tensor->dimSizeRDI[j]; catDimSize += tensor->dimSize[j];
} }
} }
} }
CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!"); CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");
int stride = 1; int stride = 1;
for (int i = 0; i < dimRDI; i++)
stride *= big->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < big->order; i++) for (int i = 0; i < dim; i++)
blockNum *= big->dimSizeRDI[i]; blockNum *= big->dimSize[i];
for (int i = dim + 1; i < big->order; i++)
stride *= big->dimSize[i];
int offset = 0; int offset = 0;
...@@ -74,8 +73,8 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim) ...@@ -74,8 +73,8 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
if (smalls->count <= MIN_TENSOR_CAT_NUM) { if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize; int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize; int tPitch = stride * big->dimSize[dim] * big->unitSize;
int mSize = sPitch; int mSize = sPitch;
int n = blockNum; int n = blockNum;
XMemCopy2D((char*)big->data + offset, tPitch, big->devID, XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
...@@ -89,7 +88,7 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim) ...@@ -89,7 +88,7 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
int * blockSizes = new int[smalls->count]; int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize; blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
sourceArrays->Add(tensor->data); sourceArrays->Add(tensor->data);
} }
......
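The XMemCopy2D pitch arithmetic carries over directly: each small tensor contributes n = blockNum rows of mSize = stride * tensor->dimSize[dim] * unitSize bytes, written into big at row pitch tPitch = stride * big->dimSize[dim] * unitSize. For instance, concatenating two (2, 3) float tensors along dim = 1 into a (2, 6) result gives stride = 1, blockNum = 2, sPitch = mSize = 12 bytes and tPitch = 24 bytes, so the second tensor's rows land at byte offset 12 within each 24-byte destination row.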
...@@ -45,10 +45,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -45,10 +45,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if(leadingDim < 0) if(leadingDim < 0)
leadingDim = 0; leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1; if (leadingDim >= s->order)
int leadingDimRDI = s->order - leadingDim - 1; leadingDim = leadingDim - s->order;
if (leadingDimRDI < 0)
leadingDimRDI = s->order - 1;
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!"); CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -56,19 +54,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -56,19 +54,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!"); CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) { if (i == whereToMerge) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i < leadingDimRDI){ else if (i < leadingDim){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i > leadingDimRDI) { else if (i > leadingDim) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]), CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -77,14 +76,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -77,14 +76,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int blockNum = 1; int blockNum = 1;
int gridSize = 1; int gridSize = 1;
int gridNum = 1; int gridNum = 1;
int mergedNum = s->dimSizeRDI[leadingDimRDI]; int mergedNum = s->dimSize[leadingDim];
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i <= leadingDimRDI) { if (i >= leadingDim) {
if (i <= whereToMergeRDI) if (i >= whereToMerge)
blockSize *= s->dimSizeRDI[i]; blockSize *= s->dimSize[i];
else else
blockNum *= s->dimSizeRDI[i]; blockNum *= s->dimSize[i];
} }
} }
...@@ -121,7 +120,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -121,7 +120,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if (!isOnSameDevice) if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
int blockNumInMerge = s->dimSizeRDI[leadingDimRDI]; int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge; int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
...@@ -238,12 +237,11 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge) ...@@ -238,12 +237,11 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
int mergedNum = smalls->count; int mergedNum = smalls->count;
XTensor * s0 = (XTensor*)smalls->GetItem(0); XTensor * s0 = (XTensor*)smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) { for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI) if (i >= whereToMerge)
blockSize *= s0->dimSizeRDI[i]; blockSize *= s0->dimSize[i];
else else
blockNum *= s0->dimSizeRDI[i]; blockNum *= s0->dimSize[i];
} }
CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!"); CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
......
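In direct indexing, _Merge folds s->dimSize[leadingDim] into whereToMerge (the leading dimension must come first, hence the leadingDim < whereToMerge check), and blockSize/blockNum are taken from the dimensions at or after whereToMerge versus those between leadingDim and whereToMerge. For instance, merging s = (2, 3, 4) with leadingDim = 0 at whereToMerge = 2 yields t = (3, 8): blockNum = 6 blocks of blockSize = 4, interleaved in groups of mergedNum = 2 along the merged dimension.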
...@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!"); CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");
int whereToSplitRDI = s->order - whereToSplit - 1;
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum), CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else { else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
/* for the case that we split the last dimension. Actually /* for the case that we split the first dimension. Actually
(N, M) and (N, M/3, 3) have the same memory layout */ (N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) { if (0 == whereToSplit) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize); XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return; return;
} }
...@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
blockSize *= s->dimSizeRDI[i] / splitNum; blockSize *= s->dimSize[i] / splitNum;
blockNum *= splitNum; blockNum *= splitNum;
} }
else if (i < whereToSplitRDI) else if (i > whereToSplit)
blockSize *= s->dimSizeRDI[i]; blockSize *= s->dimSize[i];
else else
blockNum *= s->dimSizeRDI[i]; blockNum *= s->dimSize[i];
} }
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!"); CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
...@@ -215,7 +214,6 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum) ...@@ -215,7 +214,6 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!"); CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
CheckNTErrors((smalls->count > 0), "Wrong input!"); CheckNTErrors((smalls->count > 0), "Wrong input!");
int whereToSplitRDI = big->order - whereToSplit - 1;
bool uniform = true; bool uniform = true;
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
...@@ -231,14 +229,14 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum) ...@@ -231,14 +229,14 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < big->order; i++) { for (int i = 0; i < big->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
blockSize *= big->dimSizeRDI[i] / splitNum; blockSize *= big->dimSize[i] / splitNum;
blockNum *= splitNum; blockNum *= splitNum;
} }
else if (i < whereToSplitRDI) else if (i > whereToSplit)
blockSize *= big->dimSizeRDI[i]; blockSize *= big->dimSize[i];
else else
blockNum *= big->dimSizeRDI[i]; blockNum *= big->dimSize[i];
} }
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!"); CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......
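_Split is the mirror image: the split count becomes the new leading dimension of t (t->dimSize[0] == splitNum), so every remaining dimension of s shifts right by one in t. For instance, splitting s = (4, 6) at whereToSplit = 1 with splitNum = 3 yields t = (3, 4, 2), with blockSize = 2 and blockNum = 12. The fast path accordingly now triggers at whereToSplit == 0, since (N, M) and (splitNum, N/splitNum, M) share the same row-major memory layout.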
...@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!"); CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!"); CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) { for (int i = 0; i < b->order; i++) {
if (i < dimRDI) { if (i < dim) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
} }
else if (i > dimRDI) { else if (i > dim) {
CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
} }
else { else {
CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
} }
} }
...@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockNumA = 1; int blockNumA = 1;
int blockNumB = 1; int blockNumB = 1;
for (int i = 0; i < dimRDI; i++) for (int i = dim; i < a->order; i++)
blockSize *= a->dimSizeRDI[i]; blockSize *= a->dimSize[i];
realBlockSize = blockSize * a->unitSize; realBlockSize = blockSize * a->unitSize;
......
...@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1; int blockSize = 1;
int blockNumA = 1; int blockNumA = 1;
int blockNumB = 1; int blockNumB = 1;
int dimRDI = b->order - dim - 1; for (int i = dim; i < a->order; i++)
for (int i = 0; i < dimRDI; i++) blockSize *= a->dimSize[i];
blockSize *= a->dimSizeRDI[i];
blockNumA = a->unitNum / blockSize; blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize; blockNumB = b->unitNum / blockSize;
...@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (dimRDI == 0) { if (dim == b->order - 1) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
......
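For _Unsqueeze the copied block is everything at or after the insertion point in a. For instance, unsqueezing a = (3, 4) at dim = 1 with dSize = 5 gives b = (3, 5, 4); blockSize = 4, blockNumA = 3 and blockNumB = 15, i.e. each of the 3 source blocks is replicated dSize = 5 times. The CUDA fast path now tests dim == b->order - 1, the direct-index equivalent of the old dimRDI == 0.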
...@@ -42,7 +42,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -42,7 +42,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors((a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */ /* make the index tensor */
index->SetAscendingOrder(dim); index->SetAscendingOrder(dim);
...@@ -55,13 +54,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -55,13 +54,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
} }
else { else {
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSize = stride * strideNum; int blockSize = stride * strideNum;
_CopyValues(a, b); _CopyValues(a, b);
......
...@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in ...@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!"); CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1; if (k < 0 || k > b->dimSize[dim])
if (k < 0 || k > b->dimSizeRDI[dimRDI]) k = b->dimSize[dim];
k = b->dimSizeRDI[dimRDI];
XMem * mem = a->mem; XMem * mem = a->mem;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum); int m = GetNextPower2(strideNum);
int n = stride * blockNum; int n = stride * blockNum;
......
...@@ -41,15 +41,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -41,15 +41,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index == NULL || a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((index == NULL || a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) { for (int i = 0; i < a->order; i++) {
if (i == dimRDI) { if (i == dim) {
CheckNTErrors((b->dimSizeRDI[i] == k), "A too large K"); CheckNTErrors((b->dimSize[i] == k), "A too large K");
CheckNTErrors((index == NULL || index->dimSizeRDI[i] == k), "Wrong size!"); CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
} }
else { else {
CheckNTErrors((b->dimSizeRDI[i] == a->dimSizeRDI[i]), "Wrong size!"); CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
CheckNTErrors((index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i]), "Wrong size!"); CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
} }
} }
...@@ -64,14 +63,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -64,14 +63,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1; int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
int strideNumB = b->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNumA = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; int strideNumB = b->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSizeA = stride * strideNumA; int blockSizeA = stride * strideNumA;
int blockSizeB = stride * strideNumB; int blockSizeB = stride * strideNumB;
......
...@@ -811,15 +811,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -811,15 +811,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "A too large K"); CheckNTErrors((b->dimSize[dim] == k), "A too large K");
int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNumA = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32; int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */ /* adjust the thread num according size of k for fitting the share memory size */
......
...@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim) ...@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{ {
CheckNTErrors((a->dataType == X_INT), "TODO!"); CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
......
...@@ -49,7 +49,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -49,7 +49,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return; return;
} }
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse && if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{ {
...@@ -69,13 +68,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -69,13 +68,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * blockMax = NULL; XTensor * blockMax = NULL;
XTensor * blockSum = NULL; XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -86,7 +85,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -86,7 +85,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
_ReduceSum(x, sum, leadDim, max, 1.0F, true); _ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) { if (x->devID >= 0) {
if(leadDimRDI == 0){ if(leadDim == x->order - 1){
blockSize = y->unitNum; blockSize = y->unitNum;
blockNum = 1; blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem); blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
...@@ -137,7 +136,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -137,7 +136,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax->data = mp; blockMax->data = mp;
blockSum->data = sp; blockSum->data = sp;
#ifdef USE_CUDA #ifdef USE_CUDA
if(leadDimRDI == 0) if(leadDim == x->order - 1)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax); _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax); _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
...@@ -289,7 +288,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -289,7 +288,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0) if(leadDim < 0)
leadDim = y->order - 1; leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA #ifdef USE_CUDA
if (gold->devID >= 0) { if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName); _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
...@@ -297,12 +295,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -297,12 +295,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
} }
#endif #endif
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -329,10 +327,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -329,10 +327,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i); int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i); DTYPE value = gold->GetInSparse(i);
int offset = key; int offset = key;
if (dedx->dimSizeRDI[0] != gm) { if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm; int mi = key % gm;
int ni = key / gm; int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi; int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2; offset = key2;
} }
if (key >= 0 && key < size) if (key >= 0 && key < size)
...@@ -386,10 +384,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -386,10 +384,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i); int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i); DTYPE value = gold->GetInSparse(i);
int offset = key; int offset = key;
if (dedx->dimSizeRDI[0] != gm) { if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm; int mi = key % gm;
int ni = key / gm; int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi; int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2; offset = key2;
} }
if (key >= 0 && key < size) if (key >= 0 && key < size)
...@@ -421,11 +419,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -421,11 +419,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no xs we set dE/ds = 0 */ /* for columns with no xs we set dE/ds = 0 */
if (gold != NULL && gold->isSparse) { if (gold != NULL && gold->isSparse) {
CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!"); CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) { if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
int gn = gold->dimSize[0]; int gn = gold->dimSize[0];
int gm = gold->dimSize[1]; int gm = gold->dimSize[1];
int sm = dedx->dimSizeRDI[0]; int sm = dedx->dimSize[dedx->order - 1];
int sn = dedx->dimSizeRDI[1]; int sn = dedx->dimSize[dedx->order - 2];
int * flags = new int[sm]; int * flags = new int[sm];
memset(flags, 0, sizeof(int)*sm); memset(flags, 0, sizeof(int)*sm);
......
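The sparse-gold re-indexing keeps its meaning: a flat key in the gold matrix decodes as (ni, mi) = (key / gm, key % gm) and, when dedx has a different row width, is re-encoded against the last dimension as key2 = ni * dedx->dimSize[dedx->order - 1] + mi, where dimSizeRDI[0] used to denote that same row width. E.g. with gm = 4, key = 9 decodes to (ni, mi) = (2, 1) and re-encodes as key2 = 2 * 6 + 1 = 13 when the row width is 6.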
...@@ -384,13 +384,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -384,13 +384,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
"Tensors used in log softmax are not on the same GPU."); "Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!"); CheckNTErrors((gold != NULL), "No x gold standard is found!");
int leadDimRDI = y->order - leadDim - 1; int dimensionSize = y->dimSize[leadDim];
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -49,18 +49,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, ...@@ -49,18 +49,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
if (output->devID < 0) { if (output->devID < 0) {
CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!"); CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1; int dimensionSize = output->dimSize[leadDim];
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSizeRDI[i]; stride *= output->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize; blockNum = output->unitNum / blockSize;
...@@ -206,18 +205,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, ...@@ -206,18 +205,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
{ {
CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!"); CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
CheckNTErrors(XTensor::IsSameShaped(gold, output), "The input tensors must be of the same size!"); CheckNTErrors(XTensor::IsSameShaped(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!"); CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!"); CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = output->order - leadDim - 1; int dimensionSize = output->dimSize[leadDim];
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSizeRDI[i]; stride *= output->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize; blockNum = output->unitNum / blockSize;
...@@ -408,21 +406,21 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -408,21 +406,21 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors(t->order > leadDim, "Illegal leading dimension!"); CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1; if(leadDim < 0){
if(leadDimRDI < 0){ leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0; tBeg = 0;
yBeg = 0; yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI]; tLen = y->dimSize[leadDim];
printf("%d", tLen);
} }
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{ {
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!"); CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && y->dimSize[y->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!"); CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
...@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -349,22 +349,22 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -349,22 +349,22 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU."); "The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!"); CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1; if(leadDim < 0){
if(leadDimRDI < 0){ leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0; tBeg = 0;
yBeg = 0; yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI]; tLen = y->dimSize[leadDim];
printf("%d", tLen);
} }
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
int size = 1; int size = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
size = tLen * stride; size = tLen * stride;
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -40,7 +40,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -40,7 +40,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if(leadDim < 0) if(leadDim < 0)
leadDim = x->order - 1; leadDim = x->order - 1;
int leadDimRDI = x->order - leadDim - 1;
if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){ if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
int * dimSize = new int[x->order - 1]; int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){ for(int i = 0; i < x->order; i++){
...@@ -70,13 +69,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -70,13 +69,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{ else{
CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -184,7 +183,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -184,7 +183,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0) if(leadDim < 0)
leadDim = y->order - 1; leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA #ifdef USE_CUDA
if(y->devID >= 0){ if(y->devID >= 0){
...@@ -193,12 +191,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -193,12 +191,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
} }
#endif #endif
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -225,14 +225,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s ...@@ -225,14 +225,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU."); CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((XTensor::IsSameShaped(x, y)), "Input tensors must be of the same size!"); CheckNTErrors((XTensor::IsSameShaped(x, y)), "Input tensors must be of the same size!");
int leadDimRDI = y->order - leadDim - 1; int dimensionSize = y->dimSize[leadDim];
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......