Commit c9ef15f8 by 张裕浩

Remove the XTensor RDI (Reversed Dimension Indexing) representation: drop the dimSizeRDI shadow array and rewrite all shape and stride arithmetic directly against dimSize.

parent 1f4eecdd
......@@ -233,7 +233,6 @@ void XTensor::Init()
devID = -1;
order = -1;
memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = DEFAULT_DTYPE;
unitSize = sizeof(float);
unitNum = 0;
......@@ -278,7 +277,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
{
order = tensor.order;
memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = tensor.dataType;
unitSize = tensor.unitSize;
unitNum = tensor.unitNum;
......@@ -442,7 +440,7 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
return false;
for(int i = 0; i < a->order; i++){
if(a->dimSizeRDI[i] != b->dimSizeRDI[i])
if(a->dimSize[i] != b->dimSize[i])
return false;
}
......@@ -478,7 +476,6 @@ void XTensor::SetDim(int * myDimSize)
{
for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i];
dimSizeRDI[order - i - 1] = myDimSize[i];
}
}
......@@ -505,20 +502,17 @@ reshape the tensor
void XTensor::Reshape(const int myOrder, const int * myDimSize)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsRDI[MAX_TENSOR_DIM_NUM];
int num = 1;
for(int i = 0; i < myOrder; i++){
num *= myDimSize[i];
dims[i] = abs(myDimSize[i]);
dimsRDI[myOrder - i - 1] = dims[i];
}
CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
order = myOrder;
memcpy(dimSize, dims, sizeof(int) * order);
memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
}
/*
......@@ -888,7 +882,6 @@ void XTensor::SetAscendingOrder(int dim)
CheckNTErrors((dim >= 0 && dim < order), "Wrong dimension specified!");
CheckNTErrors((dataType == X_INT), "TODO!");
int dimRDI = order - dim - 1;
if(devID >= 0){
#ifdef USE_CUDA
CudaSetAscendingOrder(this, dim);
......@@ -898,13 +891,13 @@ void XTensor::SetAscendingOrder(int dim)
}
else{
int stride = 1;
int strideNum = dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= dimSizeRDI[i];
int blockNum = 1;
for(int i = dimRDI + 1; i < order; i++)
blockNum *= dimSizeRDI[i];
int strideNum = dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= dimSize[i];
for(int i = dim + 1; i < order; i++)
stride *= dimSize[i];
for(int k = 0; k < blockNum; k++){
for(int j = 0; j < strideNum; j++){
......@@ -939,17 +932,13 @@ void * XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
int * indexRDI = new int[size];
for (int i = 0; i < size; i++)
indexRDI[size - i - 1] = index[i];
int offset = indexRDI[size - 1];
for(int i = size - 2; i >= 0; i--){
CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
offset = offset * dimSizeRDI[i] + indexRDI[i];
int offset = index[0];
for(int i = 1; i < size; ++i){
CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
}
delete[] indexRDI;
if(isSparse){
DTYPE value;
......@@ -1365,7 +1354,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool zeroData = false;
for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
......@@ -1564,7 +1552,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if (isSparse) {
int num = 1;
for (int i = 0; i < order; i++)
num *= dimSizeRDI[i];
num *= dimSize[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
......@@ -1756,8 +1744,8 @@ void XTensor::Read(FILE * file, const char * label)
int ds[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++) {
ds[i] = key % dimSizeRDI[i];
key /= dimSizeRDI[i];
ds[i] = key % dimSize[i];
key /= dimSize[i];
}
Set(value, ds);
}
......
......@@ -95,8 +95,6 @@ public:
/* size of each dimension */
int dimSize[MAX_TENSOR_DIM_NUM];
/* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
int dimSizeRDI[MAX_TENSOR_DIM_NUM];
/* data unit - data type for every cell */
TENSOR_DATA_TYPE dataType;
......
......@@ -41,7 +41,6 @@ where i is the index of the item
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
......@@ -59,17 +58,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -122,7 +122,6 @@ where i is the item index
*/
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
return;
}
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
......@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int bBlockNum = 1;
int cBlockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
aBlockNum *= a->dimSizeRDI[i];
cBlockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
aBlockNum *= a->dimSize[i];
cBlockNum *= a->dimSize[i];
}
for (int i = 2; i < b->order; i++) {
CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
bBlockNum *= b->dimSizeRDI[i];
cBlockNum *= b->dimSizeRDI[i];
for (int i = 0; i < b->order - 2; i++) {
CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
bBlockNum *= b->dimSize[i];
cBlockNum *= b->dimSize[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
bool isSparseMul = false;
......@@ -230,20 +231,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
for (int i = 0; i < b.order - 2; i++)
dimSize[sub++] = b.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......@@ -280,20 +281,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
for (int i = 0; i < b.order - 2; i++)
dimSize[sub++] = b.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......
......@@ -56,7 +56,6 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0)
_MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
else
......@@ -94,27 +93,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int devIDBackup = 0;
......@@ -125,9 +124,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
a->dimSize[a->order - 2], a->dimSize[a->order - 1],
b->dimSize[b->order - 2], b->dimSize[b->order - 1],
c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#endif
......@@ -163,32 +162,32 @@ CheckNTErrors((a && b && c), "Empty input tensors!");
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};
XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
......@@ -230,7 +229,6 @@ void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
return;
......@@ -291,10 +289,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......@@ -347,10 +345,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......
......@@ -41,7 +41,6 @@ where i is the index of the item
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
......@@ -59,18 +58,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -122,7 +122,6 @@ where i is the item index
*/
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......@@ -130,18 +129,18 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -75,7 +75,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
......
......@@ -41,14 +41,14 @@ void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE bet
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
CheckNTErrors((b->order == 2 && b->dimSize[b->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
"Illegal input vector size!");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
int blockNum = 1;
for (int i = 2; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++)
blockNum *= a->dimSize[i];
int blockSize = colNum * rowNum;
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
......
......@@ -68,7 +68,7 @@ void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
CheckNTErrors((b->order == 2 && b->dimSize[b->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
......@@ -76,8 +76,8 @@ void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
int blockNum = 1;
for (int i = 2; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++)
blockNum *= a->dimSize[i];
int cudaGridSize[3];
int cudaBlockSize[3];
......
......@@ -41,7 +41,7 @@ void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE bet
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
CheckNTErrors((a->order == 2 && a->dimSize[a->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
"Illegal input vector size!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
......@@ -53,8 +53,8 @@ void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE bet
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
int blockNum = 1;
for (int i = 2; i < b->order; i++)
blockNum *= b->dimSizeRDI[i];
for (int i = 0; i < b->order - 2; i++)
blockNum *= b->dimSize[i];
int blockSize = colNum * rowNum;
if (!a->isSparse && !b->isSparse) {
......
......@@ -84,7 +84,7 @@ void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
CheckNTErrors((a->order == 2 && a->dimSize[a->order - 1] == 1 && b->dimSize[b->order - 2] == a->dimSize[a->order - 2]),
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
......@@ -92,13 +92,13 @@ void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
int blockNum = 1;
for (int i = 2; i < b->order; i++)
blockNum *= b->dimSizeRDI[i];
for (int i = 0; i < b->order - 2; i++)
blockNum *= b->dimSize[i];
int cudaGridSize[3];
int cudaBlockSize[3];
GDevs.GetCudaThread(c->devID, a->dimSizeRDI[1], cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread(c->devID, a->dimSize[a->order - 2], cudaGridSize, cudaBlockSize);
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
......
......@@ -54,7 +54,7 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int bm = b->dimSize[1];
int cn = c->dimSize[0];
int cm = c->dimSize[1];
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, cn, cm, am, alpha, (DTYPE*)a->data, am, (DTYPE*)b->data, bm, beta, (DTYPE*)c->data, cm);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
......
......@@ -44,26 +44,25 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
*/
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI) {
CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
stride *= input->dimSizeRDI[i];
if (i < dim) {
CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
blockNum *= input->dimSize[i];
}
else if (i > dimRDI) {
CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
blockNum *= input->dimSizeRDI[i];
else if (i > dim) {
CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
stride *= input->dimSize[i];
}
}
blockSize = stride * strideNum;
......
......@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
{
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = input->order - dim - 1;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i > dim)
stride *= input->dimSize[i];
else if (i < dim)
blockNum *= input->dimSize[i];
}
int cudaGridSize[3];
......
......@@ -40,12 +40,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
{
CheckNTErrors((XTensor::IsSameShaped(s, t)), "Unmatched tensors!");
int blockDimRDI = s->order - blockDim - 1;
int blockSize = 1;
int blockNum = blockNumInGrid;
int gridNum = 1;
for (int i = 0; i < blockDimRDI; i++)
blockSize *= s->dimSizeRDI[i];
for (int i = blockDim; i < s->order; i++)
blockSize *= s->dimSize[i];
CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
gridNum = s->unitNum / (blockSize * blockNum);
......
......@@ -52,26 +52,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors((dim < s->order && dim < t->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
int dimRDI = s->order - dim - 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
int blockNumSrc = 1;
int blockNumTgt = 1;
int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
int leadDimSizeSrc = s->dimSize[dim];
int leadDimSizeTgt = t->dimSize[dim];
int indexOffsetNum = 1;
for (int i = 0; i < dimRDI; i++) {
blockSizeSrc *= s->dimSizeRDI[i];
blockSizeTgt *= t->dimSizeRDI[i];
for (int i = dim + 1; i < s->order; i++) {
blockSizeSrc *= s->dimSize[i];
}
for (int i = dim + 1; i < t->order; i++) {
blockSizeTgt *= t->dimSize[i];
}
for (int i = 0; i <= dim; i++)
{
blockNumSrc *= s->dimSize[i];
blockNumTgt *= t->dimSize[i];
}
for (int i = dimRDI; i < s->order; i++)
blockNumSrc *= s->dimSizeRDI[i];
for (int i = dimRDI; i < t->order; i++)
blockNumTgt *= t->dimSizeRDI[i];
CheckNTErrors((blockSizeSrc == blockSizeTgt), "Unmatched tensors!");
indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
indexOffsetNum = blockNumSrc / s->dimSize[dim];
int realIndexSize = indexOffsetNum * indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
......
......@@ -160,16 +160,14 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
int devID = srcIndex->devID;
XMem * mem = s->mem;
int dimRDI = srcIndex->order - dim - 1;
int stride = 1;
int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= srcIndex->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < srcIndex->order; i++)
blockNum *= srcIndex->dimSizeRDI[i];
int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++)
stride *= srcIndex->dimSize[i];
int * sIndex = NULL;
if (srcIndex->devID < 0) {
......
......@@ -43,17 +43,16 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
CheckNTErrors(dim < input->order, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
if(i < dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i]),
"Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
else if(i > dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]),
"Unmatched tensors!");
}
}
......@@ -67,31 +66,31 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i > dim)
stride *= input->dimSize[i];
else if (i < dim)
blockNum *= input->dimSize[i];
}
blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){
if (dim == input->order - 1) {
//data is contiguous along the last dimension
for(int i = 0; i < blockNum; i++){
for (int i = 0; i < blockNum; i++) {
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + i;
VectorBuffer vecBuf[4];
for(int j = 0; j < 4; j++){
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength);
for (int j = 0; j < 4; j++) {
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength);
}
for(int j = 1; j < strideNum / 32; j++){
for (int j = 1; j < strideNum / 32; j++) {
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
vecBuf[1] = vecBuf[1].maxData(VectorBuffer::loadu(ptr + 1 * vecBufLength));
......@@ -102,16 +101,17 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
vecBuf[0] = vecBuf[0].maxData(vecBuf[2]);
vecBuf[0] = vecBuf[0].maxData(vecBuf[3]);
DTYPE maxN = DTYPE_MIN;
for(int k = 0; k < vecBufLength; k++){
maxN = MAX(maxN,vecBuf[0][k]);
for (int k = 0; k < vecBufLength; k++) {
maxN = MAX(maxN, vecBuf[0][k]);
}
*op = maxN;
}
} else{
}
else {
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
VectorBuffer vecBuf[4];
......
......@@ -504,13 +504,12 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
if(i < dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
else if(i > dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
}
}
......@@ -518,15 +517,15 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int cudaBlockSize[3];
int iter = 0;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
......
......@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
......
......@@ -53,15 +53,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
CheckNTErrors(dim < input->order, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
if(i < dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
else if(i > dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
}
}
......@@ -74,21 +73,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){
if(dim == input->order - 1){
//data is contiguous along the last dimension
for(int i = 0; i < blockNum; i++){
// stride = 1
......@@ -122,7 +121,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......
......@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
if(i < dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
else if(i > dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
}
}
......@@ -709,15 +708,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int cudaBlockSize[3];
int iter = 0;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
......
......@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
int num = input->dimSize[dim];
_ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
}
......
......@@ -20,7 +20,7 @@
*/
#include "VectorBuffer.h"
#include "math.h"
namespace nts {
/* data size for each buffer */
int VectorBuffer::size()
......
......@@ -39,30 +39,29 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0;
int dimRDI = big->order - dim - 1;
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
for (int j = 0; j < big->order; j++) {
if (j != dimRDI) {
CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!");
if (j != dim) {
CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
}
else {
catDimSize += tensor->dimSizeRDI[j];
catDimSize += tensor->dimSize[j];
}
}
}
CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!");
CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");
int stride = 1;
for (int i = 0; i < dimRDI; i++)
stride *= big->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < big->order; i++)
blockNum *= big->dimSizeRDI[i];
for (int i = 0; i < dim; i++)
blockNum *= big->dimSize[i];
for (int i = dim + 1; i < big->order; i++)
stride *= big->dimSize[i];
int offset = 0;
......@@ -74,8 +73,8 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize;
int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
int tPitch = stride * big->dimSize[dim] * big->unitSize;
int mSize = sPitch;
int n = blockNum;
XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
......@@ -89,7 +88,7 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
sourceArrays->Add(tensor->data);
}
......
......@@ -45,10 +45,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if(leadingDim < 0)
leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1;
int leadingDimRDI = s->order - leadingDim - 1;
if (leadingDimRDI < 0)
leadingDimRDI = s->order - 1;
if (leadingDim >= s->order)
leadingDim = leadingDim - s->order;
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -56,19 +54,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
if (i == whereToMerge) {
CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
"Unmatched tensor sizes!");
}
else if (i < leadingDimRDI){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
else if (i < leadingDim){
CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
"Unmatched tensor sizes!");
}
else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
else if (i > leadingDim) {
CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
"Unmatched tensor sizes!");
}
}
......@@ -77,14 +76,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
int mergedNum = s->dimSizeRDI[leadingDimRDI];
int mergedNum = s->dimSize[leadingDim];
for (int i = 0; i < s->order; i++) {
if (i <= leadingDimRDI) {
if (i <= whereToMergeRDI)
blockSize *= s->dimSizeRDI[i];
if (i >= leadingDim) {
if (i >= whereToMerge)
blockSize *= s->dimSize[i];
else
blockNum *= s->dimSizeRDI[i];
blockNum *= s->dimSize[i];
}
}
......@@ -121,7 +120,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
int blockNumInMerge = s->dimSizeRDI[leadingDimRDI];
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
......@@ -238,12 +237,11 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
int mergedNum = smalls->count;
XTensor * s0 = (XTensor*)smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI)
blockSize *= s0->dimSizeRDI[i];
if (i >= whereToMerge)
blockSize *= s0->dimSize[i];
else
blockNum *= s0->dimSizeRDI[i];
blockNum *= s0->dimSize[i];
}
CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
......
......@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!");
CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");
int whereToSplitRDI = s->order - whereToSplit - 1;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
if (i == whereToSplit) {
CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
"Unmatched tensor sizes!");
}
else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
"Unmatched tensor sizes!");
}
}
/* for the case that we split the first dimension. Actually
(N, M) and (k, N/k, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) {
if (0 == whereToSplit) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return;
}
......@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
blockSize *= s->dimSizeRDI[i] / splitNum;
if (i == whereToSplit) {
blockSize *= s->dimSize[i] / splitNum;
blockNum *= splitNum;
}
else if (i < whereToSplitRDI)
blockSize *= s->dimSizeRDI[i];
else if (i > whereToSplit)
blockSize *= s->dimSize[i];
else
blockNum *= s->dimSizeRDI[i];
blockNum *= s->dimSize[i];
}
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......@@ -215,7 +214,6 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
CheckNTErrors((smalls->count > 0), "Wrong input!");
int whereToSplitRDI = big->order - whereToSplit - 1;
bool uniform = true;
for (int i = 0; i < smalls->count; i++) {
......@@ -231,14 +229,14 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < big->order; i++) {
if (i == whereToSplitRDI) {
blockSize *= big->dimSizeRDI[i] / splitNum;
if (i == whereToSplit) {
blockSize *= big->dimSize[i] / splitNum;
blockNum *= splitNum;
}
else if (i < whereToSplitRDI)
blockSize *= big->dimSizeRDI[i];
else if (i > whereToSplit)
blockSize *= big->dimSize[i];
else
blockNum *= big->dimSizeRDI[i];
blockNum *= big->dimSize[i];
}
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......
......@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) {
if (i < dimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!");
if (i < dim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
}
else if (i > dimRDI) {
CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!");
else if (i > dim) {
CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
}
else {
CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!");
CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
}
}
......@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockNumA = 1;
int blockNumB = 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
realBlockSize = blockSize * a->unitSize;
......
......@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1;
int blockNumA = 1;
int blockNumB = 1;
int dimRDI = b->order - dim - 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize;
......@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (dimRDI == 0) {
if (dim == b->order - 1) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
......
......@@ -42,7 +42,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */
index->SetAscendingOrder(dim);
......@@ -55,13 +54,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
}
else {
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSize = stride * strideNum;
_CopyValues(a, b);
......
......@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1;
if (k < 0 || k > b->dimSizeRDI[dimRDI])
k = b->dimSizeRDI[dimRDI];
if (k < 0 || k > b->dimSize[dim])
k = b->dimSize[dim];
XMem * mem = a->mem;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
......
......@@ -41,15 +41,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index == NULL || a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) {
if (i == dimRDI) {
CheckNTErrors((b->dimSizeRDI[i] == k), "K is too large!");
CheckNTErrors((index == NULL || index->dimSizeRDI[i] == k), "Wrong size!");
if (i == dim) {
CheckNTErrors((b->dimSize[i] == k), "K is too large!");
CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
}
else {
CheckNTErrors((b->dimSizeRDI[i] == a->dimSizeRDI[i]), "Wrong size!");
CheckNTErrors((index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i]), "Wrong size!");
CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
}
}
......@@ -64,14 +63,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
int strideNumB = b->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
int strideNumB = b->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSizeA = stride * strideNumA;
int blockSizeB = stride * strideNumB;
......
......@@ -811,15 +811,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "K is too large!");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */
......
......@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{
CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3];
int blockSize[3];
......
......@@ -49,7 +49,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return;
}
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
......@@ -69,13 +68,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * blockMax = NULL;
XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -86,7 +85,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) {
if(leadDimRDI == 0){
if(leadDim == x->order - 1){
blockSize = y->unitNum;
blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
......@@ -137,7 +136,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax->data = mp;
blockSum->data = sp;
#ifdef USE_CUDA
if(leadDimRDI == 0)
if(leadDim == x->order - 1)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
......@@ -289,7 +288,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
......@@ -297,12 +295,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -329,10 +327,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i);
int offset = key;
if (dedx->dimSizeRDI[0] != gm) {
if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm;
int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi;
int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2;
}
if (key >= 0 && key < size)
......@@ -386,10 +384,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i);
int offset = key;
if (dedx->dimSizeRDI[0] != gm) {
if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm;
int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi;
int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2;
}
if (key >= 0 && key < size)
......@@ -421,11 +419,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no gold entries we set dE/ds = 0 */
if (gold != NULL && gold->isSparse) {
CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) {
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
int gn = gold->dimSize[0];
int gm = gold->dimSize[1];
int sm = dedx->dimSizeRDI[0];
int sn = dedx->dimSizeRDI[1];
int sm = dedx->dimSize[dedx->order - 1];
int sn = dedx->dimSize[dedx->order - 2];
int * flags = new int[sm];
memset(flags, 0, sizeof(int)*sm);
......
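The sparse branches above remap gold-standard keys when gold and dedx disagree on row width: gold stores key = ni * gm + mi for cell (ni, mi), and the same cell sits at offset ni * sm + mi in dedx when the innermost dimension sm differs from gm. A small sketch of that remapping (hypothetical helper, not in the library):

/* remap a row-major sparse key from a gm-wide layout to an sm-wide layout */
int RemapSparseKey(int key, int gm, int sm)
{
    if (sm == gm)
        return key;       /* layouts agree, nothing to do */
    int mi = key % gm;    /* column inside the gold row */
    int ni = key / gm;    /* row index */
    return ni * sm + mi;  /* offset of the same cell in dedx */
}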
......@@ -384,13 +384,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
"Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No gold standard is found!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
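Since the stride/blockNum pattern recurs in every function this commit touches, it may help to state the decomposition once: blockSize = stride * dimensionSize splits the tensor into blockNum = unitNum / blockSize independent blocks, and element j of the processed dimension lives at offset j * stride within its block. A CPU-side sketch of walking that layout (illustrative only):

/* visit every element along dimension dim of a dense row-major tensor */
void VisitAlongDim(const float * data, const int * dimSize, int order, int dim)
{
    int dimensionSize = dimSize[dim];
    int stride = 1, blockNum = 1;
    for (int i = dim + 1; i < order; i++)
        stride *= dimSize[i];
    for (int i = 0; i < dim; i++)
        blockNum *= dimSize[i];
    int blockSize = stride * dimensionSize;

    for (int k = 0; k < blockNum; k++)            /* independent blocks */
        for (int s = 0; s < stride; s++)          /* position inside a slice */
            for (int j = 0; j < dimensionSize; j++) {
                const float * cell = data + k * blockSize + j * stride + s;
                (void)cell;                       /* reduce or transform here */
            }
}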
......@@ -49,18 +49,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
if (output->devID < 0) {
CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int dimensionSize = output->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= output->dimSizeRDI[i];
for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize;
......@@ -206,18 +205,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
{
CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
CheckNTErrors(XTensor::IsSameShaped(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!");
CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int dimensionSize = output->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= output->dimSizeRDI[i];
for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize;
......@@ -408,21 +406,21 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if(leadDimRDI < 0){
leadDimRDI = y->order - 1;
if(leadDim < 0){
leadDim = 0;
tBeg = 0;
yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI];
tLen = y->dimSize[leadDim];
printf("%d", tLen);
}
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
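One equivalence here that is easy to misread: the old default leadDimRDI = y->order - 1 names the same dimension as the new leadDim = 0, so a negative leadDim still means "use dimension 0 and take its full length". A sketch with hypothetical values:

/* for y of shape [6, 50] and leadDim = -1 */
void SketchLeadDimDefault()
{
    const int dimSize[2] = {6, 50};
    int leadDim = -1;
    int tBeg = -1, yBeg = -1, tLen = -1;

    if (leadDim < 0) {
        leadDim = 0;              /* old code: leadDimRDI = order - 1 */
        tBeg = 0;
        yBeg = 0;
        tLen = dimSize[leadDim];  /* 6: full extent of dimension 0 */
    }
    (void)tBeg; (void)yBeg; (void)tLen;
}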
......@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && y->dimSize[y->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
......@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1;
diffNew->dimSize[diffNew->order - 2] = 1;
}
delete diff;
diff = diffNew;
......@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1;
diffNew->dimSize[diffNew->order - 2] = 1;
}
delete diff;
diff = diffNew;
......@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1;
diffNew->dimSize[diffNew->order - 2] = 1;
}
delete diff;
diff = diffNew;
......@@ -349,22 +349,22 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if(leadDimRDI < 0){
leadDimRDI = y->order - 1;
if(leadDim < 0){
leadDim = 0;
tBeg = 0;
yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI];
tLen = y->dimSize[leadDim];
printf("%d", tLen);
}
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
int size = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
size = tLen * stride;
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -40,7 +40,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if(leadDim < 0)
leadDim = x->order - 1;
int leadDimRDI = x->order - leadDim - 1;
if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
......@@ -70,13 +69,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -184,7 +183,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if(y->devID >= 0){
......@@ -193,12 +191,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -225,14 +225,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((XTensor::IsSameShaped(x, y)), "Input tensors must be of the same size!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......