Commit f5149a15 by liyinqiao

Merge with the Yuhao branch (with a few small changes).

parent f0b49d6d
......@@ -30,8 +30,9 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
#include "./XBLAS.h"
#include "./core/sort/TopK.h"
#include "./core/movement/Gather.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
......
......@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
#ifdef USE_CUDA
bool useCUDA = true;
#else
bool useCUDA = false;
#endif
FILE * tmpLog = NULL;
double myTime = 0;
......
......@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
#define NIUTRANSNNDEBUG
extern int verboseLevel;
extern bool useBLAS;
extern bool useCUDA;
#define FFLUSH(FILEH) \
{ \
......
......@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 256;
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 512;
*myBufSize = MILLION * 128;
}
}
}
......
......@@ -266,7 +266,6 @@ void XTensor::Init()
devID = -1;
order = -1;
memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = DEFAULT_DTYPE;
unitSize = sizeof(float);
unitNum = 0;
......@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
order = tensor.order;
enableGrad = tensor.enableGrad;
memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = tensor.dataType;
unitSize = tensor.unitSize;
unitNum = tensor.unitNum;
......@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{
if (a == NULL || b == NULL)
if(a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
......@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize)
{
for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i];
dimSizeRDI[order - i - 1] = myDimSize[i];
}
}
......@@ -598,20 +595,17 @@ reshape the tensor
void XTensor::Reshape(const int myOrder, const int * myDimSize)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsRDI[MAX_TENSOR_DIM_NUM];
int num = 1;
for(int i = 0; i < myOrder; i++){
num *= myDimSize[i];
dims[i] = abs(myDimSize[i]);
dimsRDI[myOrder - i - 1] = dims[i];
}
CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
order = myOrder;
memcpy(dimSize, dims, sizeof(int) * order);
memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
}
/*
......@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
int * indexRDI = new int[size];
for (int i = 0; i < size; i++)
indexRDI[size - i - 1] = index[i];
int offset = indexRDI[size - 1];
for(int i = size - 2; i >= 0; i--){
CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
offset = offset * dimSizeRDI[i] + indexRDI[i];
int offset = index[0];
for(int i = 1; i < size; ++i){
CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
}
delete[] indexRDI;
if(isSparse){
DTYPE value;
void * p;
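For reference, the rewritten GetCell uses the standard row-major linearization; a small worked example with an assumed shape:

// For dimSize = {2, 3, 4} and index = {1, 2, 3}:
//   offset = 1
//   offset = 1 * 3 + 2 = 5
//   offset = 5 * 4 + 3 = 23
// i.e. offset = (i0 * d1 + i1) * d2 + i2, the row-major cell index.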
......@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool zeroData = false;
for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
......@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if (isSparse) {
int num = 0;
for (int i = 0; i < order; i++)
num *= dimSizeRDI[i];
num *= dimSize[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
......@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label)
int ds[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++) {
ds[i] = key % dimSizeRDI[i];
key /= dimSizeRDI[i];
ds[i] = key % dimSize[i];
key /= dimSize[i];
}
Set(value, ds);
}
......
......@@ -100,9 +100,6 @@ public:
/* size of each dimension */
int dimSize[MAX_TENSOR_DIM_NUM];
/* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
int dimSizeRDI[MAX_TENSOR_DIM_NUM];
/* data unit - data type for every cell */
TENSOR_DATA_TYPE dataType;
......
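For readers following the refactor, the removed Reversed Dimension Indexing (RDI) arrays relate to the standard dimSize array as sketched below; the helper name is hypothetical and not part of the library:

// For a tensor of order n, RDI stores the dimensions back to front:
//   dimSizeRDI[i] == dimSize[n - 1 - i]
// so a leading dimension translates as leadingDimRDI == n - leadingDim - 1,
// and loops that tested (i < leadingDimRDI) over RDI indices become
// (i > leadingDim) over standard indices throughout this commit.
static inline int RDIToStandard(int order, int rdiIndex)
{
    return order - 1 - rdiIndex;
}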
......@@ -49,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
......@@ -64,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
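A minimal sketch of the blocking arithmetic in _Div under standard indexing, with an assumed shape:

// For a, b, c of size (2, 3, 4) and leadingDim = 1:
//   dimensionSizeA = a->dimSize[1]                 = 3
//   stride         = product of dims after dim 1   = 4
//   blockSizeA     = stride * dimensionSizeA       = 12
//   blockNum       = remaining product of leading dims = 2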
......@@ -122,7 +122,6 @@ where i is the item index
*/
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
return;
}
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
......@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int bBlockNum = 1;
int cBlockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
aBlockNum *= a->dimSizeRDI[i];
cBlockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
aBlockNum *= a->dimSize[i];
cBlockNum *= a->dimSize[i];
}
for (int i = 2; i < b->order; i++) {
CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
bBlockNum *= b->dimSizeRDI[i];
cBlockNum *= b->dimSizeRDI[i];
for (int i = 0; i < b->order - 2; i++) {
CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
bBlockNum *= b->dimSize[i];
cBlockNum *= b->dimSize[i];
}
TensorList * aList = new TensorList(10);
TensorList * bList = new TensorList(10);
TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
bool isSparseMul = false;
......@@ -215,20 +216,20 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
return false;
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a->order + b->order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a->order; i++)
dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
for (int i = 2; i < b->order; i++)
dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
for (int i = 0; i < a->order - 2; i++)
dimSize[sub++] = a->dimSize[i];
for (int i = 0; i < b->order - 2; i++)
dimSize[sub++] = b->dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
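A worked example of how the output shape is assembled under the standard indexing (shapes assumed for illustration):

// a of size (8, 2, 3), b of size (8, 3, 5), no transposition:
//   an = a->dimSize[a->order - 2] = 2,  am = a->dimSize[a->order - 1] = 3
//   bn = b->dimSize[b->order - 2] = 3,  bm = b->dimSize[b->order - 1] = 5
//   order   = a->order + b->order - 2  = 4
//   dimSize = {8, 8, 2, 5}   // a's batch dims, b's batch dims, then an, bm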
......@@ -271,20 +272,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
for (int i = 0; i < b.order - 2; i++)
dimSize[sub++] = b.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......@@ -318,20 +319,20 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
for (int i = 0; i < b.order - 2; i++)
dimSize[sub++] = b.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......@@ -370,20 +371,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
for (int i = 0; i < b.order - 2; i++)
dimSize[sub++] = b.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......@@ -416,20 +417,20 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
for (int i = 0; i < b.order - 2; i++)
dimSize[sub++] = b.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
......
......@@ -95,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int devIDBackup = 0;
......@@ -126,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
a->dimSize[a->order - 2], a->dimSize[a->order - 1],
b->dimSize[b->order - 2], b->dimSize[b->order - 1],
c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#endif
......@@ -164,32 +164,32 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};
XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
......@@ -292,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......@@ -350,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......
......@@ -71,20 +71,21 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = x.dimSizeRDI[1];
int xm = x.dimSizeRDI[0];
int wn = w.dimSizeRDI[1];
int wm = w.dimSizeRDI[0];
int xn = x.dimSize[x.order - 2];
int xm = x.dimSize[x.order - 1];
int wn = w.dimSize[w.order - 2];
int wm = w.dimSize[w.order - 1];
CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
for (int i = 0; i < x.order - 2; i++)
dimSize[sub++] = x.dimSize[i];
for (int i = 0; i < w.order - 2; i++)
dimSize[sub++] = w.dimSize[i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
......@@ -148,18 +149,18 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
for (int i = 0; i < x.order - 2; i++)
dimSize[sub++] = x.dimSize[i];
for (int i = 0; i < w.order - 2; i++)
dimSize[sub++] = w.dimSize[i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
......
......@@ -49,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
......@@ -64,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -122,26 +122,25 @@ where i is the item index
*/
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -70,20 +70,6 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return;
}
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaSumDim(a, b, c, n, beta);
......
......@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
if(row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
if(betaFired)
c[offset] = a[offset] + bv[threadIdx.y] * beta;
else
c[offset] = a[offset] + bv[threadIdx.y];
......
......@@ -139,6 +139,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
}
/*
convert an index array to a onehot tensor
>> index - the index array, whose entries are integer ids
>> n - length of the index array
>> onehot - the onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
{
/*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
onehot->SetZeroAll();
#ifdef USE_CUDA
if (onehot->devID >= 0) {
delete[] cudaIndex;
return;
}
#endif
int blockNum = n;
int stride = size;
int * indexData = (int *)index;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}*/
XTensor* cudaIndex = NewTensor1D(n, X_INT, onehot->devID);
cudaIndex->SetData(index, n);
_IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
delete[] cudaIndex;
}
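A hedged usage sketch for the new overload; NewTensor2D and DelTensor appear elsewhere in this commit, but the data type and default arguments here are assumptions, not prescribed by it:

int index[3] = {1, 0, 3};
// a dense 3 x 4 onehot target on the CPU (devID = -1); data type assumed,
// the tensor overload enforces the actual requirement
XTensor * onehot = NewTensor2D(3, 4, X_FLOAT, -1);
_IndexToOnehot(index, 3, onehot, 4, 0.0F);   // no label smoothing
DelTensor(onehot);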
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
......
......@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
......
......@@ -25,6 +25,82 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a tensor with selected data in index along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> index - the selected index
>> dim - the dimension along which we do the job
*/
void _Select(const XTensor * a, XTensor * c, int* index, int dim)
{
CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must be of the same order!");
CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
CheckNTErrors(a->dataType == c->dataType, "The tensor must be of the same data type!");
int stride = 1;
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
printf("\n%d %d\n", a->order - dim - 1,stride);
int copyTimes = 1;
for (int i = 0; i < dim; i++)
{
copyTimes *= a->dimSize[i];
}
int cot = c->dimSize[dim];
int blockSize = stride * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
char * s = (char*)a->data;
char * t = (char*)c->data;
for (int i = 0; i < copyTimes; i++) {
for (int j = 0; j < cot; ++j) {
XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
}
s += stepSizeS;
t += stepSizeT;
}
}
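A hedged usage sketch for the new _Select overload; the constructor and initializer calls are assumptions for illustration:

// pick columns 0 and 2 of a 3 x 4 tensor, i.e. select along dimension 1
XTensor * a = NewTensor2D(3, 4, X_FLOAT, -1);
XTensor * c = NewTensor2D(3, 2, X_FLOAT, -1);
int index[2] = {0, 2};
a->SetDataRand(-1.0F, 1.0F);
_Select(a, c, index, 1);
DelTensor(c);
DelTensor(a);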
/*
generate a tensor with selected data in index along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> index - the selected index
>> dim - the dimension along which we do the job
*/
void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
{
if (index->devID >= 0)
{
int* indexCPU = new int[index->unitNum];
XMemCopy(indexCPU, -1, index->data,index->devID, index->unitNum * sizeof(int));
_Select(a, c, indexCPU, dim);
delete[] indexCPU;
}
else
{
_Select(a, c, (int *)index->data, dim);
}
}
/*
*/
/*XTensor Select(const XTensor &a, int* index, int dim)
{
}*/
/*
generate a tensor with selected data in range[low,high] along the given dimension
......@@ -58,13 +134,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
}
int stride = 1;
int dimRDI = a->order - dim - 1;
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int copyTimes = 1;
for (int i = dimRDI + 1; i < a->order; i++)
copyTimes *= a->dimSizeRDI[i];
for (int i = 0; i < dim; i++)
copyTimes *= a->dimSize[i];
int blockSize = stride * (high - low) * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
......@@ -117,12 +192,10 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
_SelectRange(&a, &c, dim, low, high);
/* tensor connection */
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
}
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
/* destroy variables */
delete[] dimSize;
......
......@@ -27,7 +27,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
void _Select(const XTensor * a, XTensor * c, int* index, int dim);
/* generate a tensor with selected data c = select(a) */
void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);
/*
generate a tensor with selected data c = select(a) (return an XTensor structure)
......
......@@ -47,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((_IsSameShaped(input, output)), "Unmatched input tensors!");
CheckNTErrors((_IsSameShaped(a, b)), "Unmatched input tensors");
CheckNTErrors((_IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI) {
CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
stride *= input->dimSizeRDI[i];
if (i < dim) {
CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
blockNum *= input->dimSize[i];
}
else if (i > dimRDI) {
CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
blockNum *= input->dimSizeRDI[i];
else if (i > dim) {
CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
stride *= input->dimSize[i];
}
}
blockSize = stride * strideNum;
......
......@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
{
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = input->order - dim - 1;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i > dim)
stride *= input->dimSize[i];
else if (i < dim)
blockNum *= input->dimSize[i];
}
int cudaGridSize[3];
......
......@@ -41,12 +41,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
{
CheckNTErrors((_IsSameShaped(s, t)), "Unmatched tensors!");
int blockDimRDI = s->order - blockDim - 1;
int blockSize = 1;
int blockNum = blockNumInGrid;
int gridNum = 1;
for (int i = 0; i < blockDimRDI; i++)
blockSize *= s->dimSizeRDI[i];
for (int i = blockDim; i < s->order; i++)
blockSize *= s->dimSize[i];
CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
gridNum = s->unitNum / (blockSize * blockNum);
......
......@@ -53,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors(dim < s->order && dim < t->order, "A too large dimension specified!");
CheckNTErrors(s->unitSize == t->unitSize, "Unmatched tensors!");
int dimRDI = s->order - dim - 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
int blockNumSrc = 1;
int blockNumTgt = 1;
int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
int leadDimSizeSrc = s->dimSize[dim];
int leadDimSizeTgt = t->dimSize[dim];
int indexOffsetNum = 1;
for (int i = 0; i < dimRDI; i++) {
blockSizeSrc *= s->dimSizeRDI[i];
blockSizeTgt *= t->dimSizeRDI[i];
for (int i = dim + 1; i < s->order; i++) {
blockSizeSrc *= s->dimSize[i];
}
for (int i = dim + 1; i < t->order; i++) {
blockSizeTgt *= t->dimSize[i];
}
for (int i = 0; i <= dim; i++)
{
blockNumSrc *= s->dimSize[i];
blockNumTgt *= t->dimSize[i];
}
for (int i = dimRDI; i < s->order; i++)
blockNumSrc *= s->dimSizeRDI[i];
for (int i = dimRDI; i < t->order; i++)
blockNumTgt *= t->dimSizeRDI[i];
CheckNTErrors(blockSizeSrc == blockSizeTgt, "Unmatched tensors!");
indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
indexOffsetNum = blockNumSrc / s->dimSize[dim];
int realIndexSize = indexOffsetNum * indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
......@@ -219,14 +221,14 @@ make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (4, 2, 3) and dim = 0,
we have 4 sub-tensors of size (2, 3)
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [0,1] and copyNum = 2,
we actually copy the source sub-tensors 0, 1, 1 and 2
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor & s, int dim,
......@@ -277,14 +279,14 @@ make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (4, 2, 3) and dim = 0,
we have 4 sub-tensors of size (2, 3)
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [0,1] and copyNum = 2,
we actually copy the source sub-tensors 0, 1, 1 and 2
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
......
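Following the example in the comments above, a hedged usage sketch; the tensor s is assumed to be a (3, 2, 4) tensor:

// copy sub-tensors 1 and 3 along dimension 2 into a (3, 2, 2) result
int srcIndex[2] = {1, 3};
int tgtIndex[2] = {0, 1};
XTensor t = CopyIndexed(s, 2, srcIndex, 2, tgtIndex, 1);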
......@@ -33,6 +33,51 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim);
return;
}
#endif
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
......@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int order = index.order + 1;
int order = s.order;
int * dimSize = new int[order];
memcpy(dimSize, index.dimSize, index.order * sizeof(int));
dimSize[index.order] = s.GetDim(-1);
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = index.unitNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
......@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather(&s, &t, &index);
/* tensor connection */
if (s.enableGrad) {
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
}
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = t.GetDim(-1);
return t;
XTensor tt;
tt = Reshape(t, index.order + 1, dims);
delete[] dims;
return tt;
}
else {
return t;
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
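A hedged usage sketch for Gather on a 2-D tensor; the initializer names and default arguments are assumptions for illustration:

XTensor s, index;
InitTensor2D(&s, 5, 8, X_FLOAT);
InitTensor1D(&index, 3, X_INT);
int rows[3] = {0, 4, 2};
index.SetData(rows, 3);
XTensor t = Gather(s, index);   // rows 0, 4 and 2 of s; t has size (3, 8)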
......@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/*
gather indexed sub-tensors(cuda version)
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the index of the source tensor
>> indexSize - the size of the srcIndex
>> stride - stride of a data block
>> strideNum - size of the gathered dimension within a data block
>> blockNum - number of data blocks
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
int size = stride * strideNum * blockNum;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]];
}
}
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
......@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
BacktoCudaDev(devID, devIDBackup);
}
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
int devID = srcIndex->devID;
XMem * mem = s->mem;
int stride = 1;
int blockNum = 1;
int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++)
stride *= srcIndex->dimSize[i];
int * sIndex = NULL;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors(cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex,int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,8 +27,14 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors according to the given dimension */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(XTensor &s, XTensor &index);
......
......@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */
void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min value of the items along a dimension of the tensor. */
void _ReduceMin(const XTensor * input, XTensor * output, int dim);
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */
void ReduceMax(const XTensor &input, XTensor &output, int dim);
/*
get the min value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMin(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
......
......@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
CheckNTErrors(dim < input->order, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
if(i < dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
else if(i > dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
}
}
......@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){
if(dim == input->order - 1){
//data is contiguous in dim 0
for(int i = 0; i < blockNum; i++){
// stride = 1
......@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......
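A small worked example of the reduction geometry under standard indexing (shape assumed):

// input of size (2, 3, 4), dim = 1:
//   strideNum = dimSize[1] = 3      (the reduced dimension)
//   blockNum  = dims before dim     = 2
//   stride    = dims after dim      = 4
//   blockSize = stride * strideNum  = 12
//   output has size (2, 4)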
......@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
if(i < dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
else if(i > dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
}
}
......@@ -709,31 +708,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int cudaBlockSize[3];
int iter = 0;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
int devID = input->devID;
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) {
dim3 grids;
......@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum,sp, power, isExp);
}
else {
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
......@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++;
} while (strideNum > 1);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize);
else
XMemFree(devID, buf);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize);
else
XMemFree(input->devID, buf);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
......
......@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
int num = input->dimSize[dim];
_ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
}
......
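The variance routine composes the two primitives shown above; spelled out for a reduced dimension of size n:

// _ReduceSum(input, output, dim, mean, 2.0F)  ->  sum_i (a_i - mean)^2
// _ScaleAndShiftMe(output, 1/n, 0)            ->  (1/n) * sum_i (a_i - mean)^2
// which matches variance = 1/n * sum_i (a_i - mean)^2 with n = input->dimSize[dim].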
......@@ -20,7 +20,7 @@
*/
#include "VectorBuffer.h"
//#include "math.h"
namespace nts {
/* data size for each buffer */
int VectorBuffer::size()
......@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
return *this;
}
/* compute the min of two buffers */
VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
for (int i = 0; i != a.size(); i++) {
this->values[i] = MIN(a[i], this->values[i]);
printf("runhere");
}
return *this;
}
}/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -20,7 +20,6 @@
*/
//#include <cstring>
#include <math.h>
#include "../../XGlobal.h"
namespace nts {
......@@ -49,5 +48,8 @@ public:
/* compute the max of two buffers */
VectorBuffer maxData(const VectorBuffer &a);
/* compute the min of two buffers */
VectorBuffer minData(const VectorBuffer &a);
};
}
\ No newline at end of file
......@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0;
int dimRDI = big->order - dim - 1;
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
for (int j = 0; j < big->order; j++) {
if (j != dimRDI) {
CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!");
if (j != dim) {
CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
}
else {
catDimSize += tensor->dimSizeRDI[j];
catDimSize += tensor->dimSize[j];
}
}
}
CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!");
CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");
int stride = 1;
for (int i = 0; i < dimRDI; i++)
stride *= big->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < big->order; i++)
blockNum *= big->dimSizeRDI[i];
for (int i = 0; i < dim; i++)
blockNum *= big->dimSize[i];
for (int i = dim + 1; i < big->order; i++)
stride *= big->dimSize[i];
int offset = 0;
......@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize;
int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
int tPitch = stride * big->dimSize[dim] * big->unitSize;
int mSize = sPitch;
int n = blockNum;
XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
......@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
sourceArrays->Add((char*)tensor->data);
}
......
......@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b)
return false;
for(int i = 0; i < a->order; i++){
if(a->dimSizeRDI[i] != b->dimSizeRDI[i])
if(a->dimSize[i] != b->dimSize[i])
return false;
}
......
......@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if(leadingDim < 0)
leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1;
int leadingDimRDI = s->order - leadingDim - 1;
if (leadingDimRDI < 0)
leadingDimRDI = s->order - 1;
if (leadingDim >= s->order)
leadingDim = leadingDim - s->order;
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
if (i == whereToMerge) {
CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
"Unmatched tensor sizes!");
}
else if (i < leadingDimRDI){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
else if (i < leadingDim){
CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
"Unmatched tensor sizes!");
}
else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
else if (i > leadingDim) {
CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
"Unmatched tensor sizes!");
}
}
......@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
int mergedNum = s->dimSizeRDI[leadingDimRDI];
int mergedNum = s->dimSize[leadingDim];
for (int i = 0; i < s->order; i++) {
if (i <= leadingDimRDI) {
if (i <= whereToMergeRDI)
blockSize *= s->dimSizeRDI[i];
if (i >= leadingDim) {
if (i >= whereToMerge)
blockSize *= s->dimSize[i];
else
blockNum *= s->dimSizeRDI[i];
blockNum *= s->dimSize[i];
}
}
......@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
int blockNumInMerge = s->dimSizeRDI[leadingDimRDI];
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
......@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
int mergedNum = smalls->count;
XTensor * s0 = smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI)
blockSize *= s0->dimSizeRDI[i];
if (i >= whereToMerge)
blockSize *= s0->dimSize[i];
else
blockNum *= s0->dimSizeRDI[i];
blockNum *= s0->dimSize[i];
}
CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
......
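As a quick numeric check of the new blockSize/blockNum loop in _Merge over a TensorList (the shape is hypothetical, purely for illustration):

#include <cassert>

// s0 of shape (2, 3, 4), whereToMerge = 1.
void mergeBlockExample()
{
    int dims[3] = {2, 3, 4};
    int whereToMerge = 1;
    int blockSize = 1, blockNum = 1;
    for (int i = 0; i < 3; i++) {
        if (i >= whereToMerge) blockSize *= dims[i];   // 3 * 4 = 12
        else                   blockNum  *= dims[i];   // 2
    }
    assert(blockSize == 12 && blockNum == 2);
    assert((2 * 3 * 4) % (blockSize * blockNum) == 0); // the unitNum check above holds
}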
......@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
/* merge two tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);
void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
} // namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__
\ No newline at end of file
......@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
>> s - the source tensor
>> t - the target tensor (for return)
......@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!");
CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");
int whereToSplitRDI = s->order - whereToSplit - 1;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
if (i == whereToSplit) {
CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
"Unmatched tensor sizes!");
}
else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
"Unmatched tensor sizes!");
}
}
/* for the case that we split the last dimension. Actually
(N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) {
(N, M) and (N, M/3, 3) have the same memory layout */
if (0 == whereToSplit) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return;
}
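The whereToSplit == 0 shortcut relies on the fact that, in row-major order, splitting the outermost dimension does not move any element, so a single flat copy suffices. A small self-contained check with hypothetical sizes:

#include <cassert>

// (N, M) viewed as (S, N/S, M): the flat offsets coincide, so the split is one memcpy.
void splitLayoutExample()
{
    const int N = 6, M = 4, S = 3;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            int flat2D = i * M + j;
            int a = i / (N / S), b = i % (N / S);
            int flat3D = (a * (N / S) + b) * M + j;
            assert(flat2D == flat3D);
        }
    }
}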
......@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
blockSize *= s->dimSizeRDI[i] / splitNum;
if (i == whereToSplit) {
blockSize *= s->dimSize[i] / splitNum;
blockNum *= splitNum;
}
else if (i < whereToSplitRDI)
blockSize *= s->dimSizeRDI[i];
else if (i > whereToSplit)
blockSize *= s->dimSize[i];
else
blockNum *= s->dimSizeRDI[i];
blockNum *= s->dimSize[i];
}
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
}
/*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
......@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
CheckNTErrors((smalls->count > 0), "Wrong input!");
int whereToSplitRDI = big->order - whereToSplit - 1;
bool uniform = true;
for (int i = 0; i < smalls->count; i++) {
......@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < big->order; i++) {
if (i == whereToSplitRDI) {
blockSize *= big->dimSizeRDI[i] / splitNum;
if (i == whereToSplit) {
blockSize *= big->dimSize[i] / splitNum;
blockNum *= splitNum;
}
else if (i < whereToSplitRDI)
blockSize *= big->dimSizeRDI[i];
else if (i > whereToSplit)
blockSize *= big->dimSize[i];
else
blockNum *= big->dimSizeRDI[i];
blockNum *= big->dimSize[i];
}
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......
......@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) {
if (i < dimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!");
if (i < dim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
}
else if (i > dimRDI) {
CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!");
else if (i > dim) {
CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
}
else {
CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!");
CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
}
}
......@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockNumA = 1;
int blockNumB = 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
realBlockSize = blockSize * a->unitSize;
......
......@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1;
int blockNumA = 1;
int blockNumB = 1;
int dimRDI = b->order - dim - 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize;
......@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (dimRDI == 0) {
if (dim == b->order - 1) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
......
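A quick numeric check of the unsqueeze block sizes under the new indexing (hypothetical shapes, not taken from a test):

#include <cassert>

// a of shape (2, 3), dim = 1, dSize = 4, so b has shape (2, 4, 3).
void unsqueezeBlockExample()
{
    int aDims[2] = {2, 3};
    int dim = 1, dSize = 4;
    int blockSize = 1;
    for (int i = dim; i < 2; i++)     // product over i >= dim, as in the new loop
        blockSize *= aDims[i];
    int unitNumA = 2 * 3;
    int unitNumB = 2 * dSize * 3;
    assert(blockSize == 3);
    assert(unitNumA / blockSize == 2);           // blockNumA
    assert(unitNumB / blockSize == 2 * dSize);   // blockNumB: each block appears dSize times
}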
......@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */
SetAscendingOrder(*index, dim);
......@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
}
else {
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSize = stride * strideNum;
_CopyValues(a, b);
......
......@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1;
if (k < 0 || k > b->dimSizeRDI[dimRDI])
k = b->dimSizeRDI[dimRDI];
if (k < 0 || k > b->dimSize[dim])
k = b->dimSize[dim];
XMem * mem = a->mem;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
......
......@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!");
CheckNTErrors(index->dataType == X_INT, "Wrong data type!");
int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) {
if (i == dimRDI) {
CheckNTErrors(b->dimSizeRDI[i] == k, "A too large K");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == k, "Wrong size!");
if (i == dim) {
CheckNTErrors((b->dimSize[i] == k), "A too large K");
CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
}
else {
CheckNTErrors(b->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
}
}
......@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
int strideNumB = b->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
int strideNumB = b->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSizeA = stride * strideNumA;
int blockSizeB = stride * strideNumB;
......
......@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "A too large K");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */
......
......@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim)
return;
}
int dimRDI = tensor.order - dim - 1;
if(tensor.devID >= 0){
#ifdef USE_CUDA
CudaSetAscendingOrder(&tensor, dim);
......@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim)
}
else{
int stride = 1;
int strideNum = tensor.dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= tensor.dimSizeRDI[i];
int blockNum = 1;
for(int i = dimRDI + 1; i < tensor.order; i++)
blockNum *= tensor.dimSizeRDI[i];
int strideNum = tensor.dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= tensor.dimSize[i];
for(int i = dim + 1; i < tensor.order; i++)
stride *= tensor.dimSize[i];
for(int k = 0; k < blockNum; k++){
for(int j = 0; j < strideNum; j++){
......
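A hedged sketch of what the CPU fill presumably does once stride, strideNum and blockNum are known: every element whose position along `dim` is j is set to j. The inner loop body is not shown in this hunk, so the pointer arithmetic below is an assumption based on the row-major layout used elsewhere in this change.

// `data` stands in for tensor.data cast to int*.
void setAscendingOrderSketch(int * data, int blockNum, int strideNum, int stride)
{
    for (int k = 0; k < blockNum; k++) {
        for (int j = 0; j < strideNum; j++) {
            int * d = data + (k * strideNum + j) * stride;
            for (int i = 0; i < stride; i++)
                d[i] = j;             // index along the ordered dimension
        }
    }
}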
......@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{
CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int stride = 1;
int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3];
int blockSize[3];
......
......@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return;
}
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
......@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * blockMax = NULL;
XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) {
if(leadDimRDI == 0){
if(leadDim == x->order - 1){
blockSize = y->unitNum;
blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
......@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax->data = mp;
blockSum->data = sp;
#ifdef USE_CUDA
if(leadDimRDI == 0)
if(leadDim == x->order - 1)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
......@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
......@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i);
int offset = key;
if (dedx->dimSizeRDI[0] != gm) {
if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm;
int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi;
int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2;
}
if (key >= 0 && key < size)
......@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i);
int offset = key;
if (dedx->dimSizeRDI[0] != gm) {
if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm;
int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi;
int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2;
}
if (key >= 0 && key < size)
......@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no xs we set dE/ds = 0 */
if (gold != NULL && gold->isSparse) {
CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) {
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
int gn = gold->dimSize[0];
int gm = gold->dimSize[1];
int sm = dedx->dimSizeRDI[0];
int sn = dedx->dimSizeRDI[1];
int sm = dedx->dimSize[dedx->order - 1];
int sn = dedx->dimSize[dedx->order - 2];
int * flags = new int[sm];
memset(flags, 0, sizeof(int)*sm);
......
......@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
"Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
if (output->devID < 0) {
CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int dimensionSize = output->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= output->dimSizeRDI[i];
for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize;
......@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
{
CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!");
CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int dimensionSize = output->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= output->dimSizeRDI[i];
for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize;
......@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if(leadDimRDI < 0){
leadDimRDI = y->order - 1;
if (leadDim < 0) {
leadDim = 0;
tBeg = 0;
yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI];
tLen = y->dimSize[leadDim];
}
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && y->dimSize[y->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
......@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1;
diffNew->dimSize[diffNew->order - 2] = 1;
}
delete diff;
diff = diffNew;
......@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1;
diffNew->dimSize[diffNew->order - 2] = 1;
}
delete diff;
diff = diffNew;
......@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1;
diffNew->dimSize[diffNew->order - 2] = 1;
}
delete diff;
diff = diffNew;
......@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if(leadDimRDI < 0){
leadDimRDI = y->order - 1;
if (leadDim < 0) {
leadDim = 0;
tBeg = 0;
yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI];
tLen = y->dimSize[leadDim];
}
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
int size = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
size = tLen * stride;
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -41,7 +41,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if(leadDim < 0)
leadDim = x->order - 1;
int leadDimRDI = x->order - leadDim - 1;
if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
......@@ -71,13 +70,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -207,8 +206,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
......@@ -216,12 +213,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
......@@ -226,14 +226,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((_IsSameShaped(x, y)), "Input tensors must be of the same size!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......
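The same decomposition appears in every softmax/log-softmax/loss hunk above; as a worked example with a hypothetical y of shape (2, 3, 4) and leadDim = 1:

// dimensionSize = y->dimSize[1]          = 3
// stride        = y->dimSize[2]          = 4    (dims after leadDim)
// blockSize     = stride * dimensionSize = 12
// blockNum      = y->unitNum / blockSize = 24 / 12 = 2   (dims before leadDim)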