Commit 7e9d7015 by xuchen

This is a fundamental integration!

parent b3a76184
@@ -96,7 +96,7 @@ void XMathGrad::GradMultiply(XTensor * node)
     XNoder::MakeGrad(a);
     XNoder::MakeGrad(b);
-    CheckNTErrors(XTensor::IsIdentical(a, b), "Wrong sized input tensors!");
+    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
     _Multiply(node->grad, b, a->grad, 1.0F);
     _Multiply(node->grad, a, b->grad, 1.0F);
 }
......
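
For reference, the two _Multiply calls in the hunk above implement the usual backward rule for the element-wise product c = a * b, written here in the same dE/dx notation used by the comments in this file (the product is taken element-wise):

    dE/da += dE/dc * b
    dE/db += dE/dc * a

Each call therefore multiplies node->grad by the other operand and accumulates into a->grad or b->grad; the trailing 1.0F is the \alpha coefficient of _Multiply (c(i) = a(i)*b(i) + \alpha*c(i), see the Multiply kernels further down), which turns the write into an accumulation.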
@@ -71,9 +71,11 @@ dE/da = split(dE/dc)
 void XShapeGrad::GradMerge(XTensor * node)
 {
     XLink &income = node->income;
-    CheckNTErrors(income.tailNum == 0, "Wrong input tensor number for MERGE!");
     XTensor * input = income.tails[0];
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
+    CheckNTErrors(node->order == input->order - 1, "wrong tensor orders!");
     int whereToMerge = income.GetParamInt(0);
     int leadDim = income.GetParamInt(1);
@@ -95,13 +97,13 @@ void XShapeGrad::GradMerge(XTensor * node)
     }
     dims[0] = -dims[0];
     XTensor gradInputSmall(input->order - leadDim, dims,
                            input->dataType, input->denseRatio,
                            input->devID, input->mem);
     dims[whereToMerge - leadDim] *= dims[0];
-    XTensor gradNodeSmall(node->order - leadDim, dims,
+    XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
                           node->dataType, node->denseRatio,
                           node->devID, node->mem);
     /* we can simply split the gradient tensor
        if the input is used in merging only */
@@ -109,7 +111,7 @@ void XShapeGrad::GradMerge(XTensor * node)
     for(int i = 0; i < blockNum; i++){
         gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
         gradInputSmall.data = (char*)input->grad->data + i * blockSize;
-        _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim, input->dimSize[leadDim]);
+        _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
     }
 }
@@ -123,7 +125,7 @@ void XShapeGrad::GradMerge(XTensor * node)
     for(int i = 0; i < blockNum; i++){
         gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
         gradInputSmall.data = (char*)input->grad->data + i * blockSize;
-        _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim, input->dimSize[leadDim]);
+        _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
         _Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
     }
 }
@@ -162,7 +164,7 @@ void XShapeGrad::GradMergeList(XTensor * node)
     smallsGrad.Add(tail->grad);
     if(i > 1){
-        CheckNTErrors(XTensor::IsIdentical(last, tail),
+        CheckNTErrors(XTensor::IsSameShaped(last, tail),
                       "Input tensors must be of the same size!");
     }
......
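
As the hunk comment dE/da = split(dE/dc) says, the backward pass of c = merge(a) splits the merged gradient back onto the input. The two loops above differ only in how the result is written back (sketched in the same notation):

    dE/da  = split(dE/dc)     if the input is used in the merge only
    dE/da += split(dE/dc)     otherwise: _Split into gradInputSmallBuf, then accumulate with _Sum

The extra "- 1" in the split dimension is consistent with the merged node having one dimension fewer than its input, which the new CheckNTErrors(node->order == input->order - 1, ...) now states explicitly, and with gradNodeSmall being built from dims + leadDim + 1.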
@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
     if(node == NULL)
         return;
-    if(!XTensor::IsIdentical(node, node->grad)){
+    if(!XTensor::IsSameShaped(node, node->grad)){
         delete node->grad;
         node->grad = NewTensor(node);
         node->grad->SetZeroAll();
......
@@ -73,8 +73,7 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
 void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
 void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
               FNNModel &model, FNNModel &grad, FNNNet &net);
-void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
-             FNNModel &model, XNet &net);
+void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
 /*
 entry of the program
@@ -415,7 +414,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         }
         else{
             /* forward + backward process */
-            FBInOne(inputs, output, gold, CROSSENTROPY, model, autoDiffer);
+            ForwardAutoDiff(inputs, output, model);
+            /* automatic differentiation */
+            autoDiffer.Backward(output, gold, CROSSENTROPY);
             /* update model parameters */
             Update(model, grad, learningRate, true);
@@ -902,17 +904,14 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
 }
 /*
-forward + backward in one procedure
+forward process (with tensor connections)
 >> inputs - input word representations
 >> output - output probability
->> gold - gold standard
->> loss - loss function name
 >> model - the fnn model
 */
-void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
-             LOSS_FUNCTION_NAME loss, FNNModel &model, XNet &net)
+void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
 {
-    int batchSize = gold.GetDim(0);
+    int batchSize = inputs[0].GetDim(0);
     int n = model.n;
     int depth = model.hDepth;
@@ -945,9 +944,6 @@ void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
     /* output layer */
     output = LogSoftmax(MMul(hidden, model.outputW) + b, 1);
-
-    /* automatic differentiation */
-    net.Backward(output);
 }
 /*
......
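
Taken together, the FNNLM changes replace the old FBInOne (forward and backward in a single call) with a forward pass that only builds the tensor connections, followed by an explicit backward call on the XNet. A minimal sketch of one training step under the new flow, using only the calls visible in this diff (inputs, output, gold, grad, learningRate and autoDiffer are the variables of the surrounding Train() function):

    XNet autoDiffer;                                  /* graph-based differentiator      */

    ForwardAutoDiff(inputs, output, model);           /* forward only: records the graph */
    autoDiffer.Backward(output, gold, CROSSENTROPY);  /* automatic differentiation       */
    Update(model, grad, learningRate, true);          /* update model parameters         */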
@@ -127,7 +127,6 @@ struct FNNNet
 };
 /* entry of the program */
-extern "C"
 int FNNLMMain(int argc, const char ** argv);
 };
......
@@ -47,9 +47,9 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
 extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
 /* data conversion (for lower precision computation) */
-extern "C" unsigned short FloatToFloat16(float f);
-extern "C" float Float16ToFloat(unsigned short h);
-extern "C" void ConvertDataType(int devID,
+unsigned short FloatToFloat16(float f);
+float Float16ToFloat(unsigned short h);
+void ConvertDataType(int devID,
                      void * s, TENSOR_DATA_TYPE typeS,
                      void * t, TENSOR_DATA_TYPE typeT, int size);
......
@@ -321,7 +321,7 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
             continue;
         XLink &outgo = t->outgo;
         CheckNTErrors(outgo.head == NULL || outgo.head == t,
                       "Wrong head of the hyperedge!");
         outgo.SetHead(t);
         outgo.AddTail(h);
     }
@@ -349,6 +349,7 @@ void XLink::MakeLink(XTensor * t, XList * list, int id)
     /* backward */
     XLink &outgo = t->outgo;
+    outgo.SetHead(t);
     CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
     for(int i = 0; i < list->count; i++){
         XTensor * t = (XTensor*)list->GetItem(i);
......
@@ -193,7 +193,7 @@ XTensor::~XTensor()
        the connectivity of the graph. To kill memory
        leak, we release the data of the new tensor
        when its parent is deleted (see ClearIncoming). */
-    if(isTmp && outgo.tailNum > 0){
+    if(outgo.tailNum > 0){
         int dims[MAX_TENSOR_DIM_NUM];
         memcpy(dims, dimSize, order * sizeof(int));
         dims[0] = -dims[0];
@@ -285,6 +285,27 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 /* overloading of the equal-sign */
 XTensor& XTensor::operator= (const XTensor& tensor)
 {
+    /* we must make a hard copy of the tensor if it is the input
+       of another node. */
+    if(outgo.tailNum > 0){
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(dims, dimSize, order * sizeof(int));
+        dims[0] = -dims[0];
+        XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
+        newTensor->SetTMP();
+        newTensor->data = data;
+        newTensor->dataHost = dataHost;
+        XLink::Replace(this, newTensor);
+        XLink::ClearOutgoing(this);
+        XLink::ClearIncoming(this);
+        newTensor->ShallowCopy(this);
+        data = NULL;
+        dataHost = NULL;
+    }
     /* hard copy of the data array */
     int size = unitNum * unitSize;
     if( isInit && !isSparse && !tensor.isSparse &&
@@ -349,7 +370,7 @@ judge whether the two matrices are in the same type and size
 >> b - anther tensor to compare with
 << return - whether the two input tensors are identical
 */
-bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
+bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
 {
     if(a == NULL || b == NULL)
         return false;
@@ -381,9 +402,9 @@ judge whether the three matrices are in the same type and size
 >> c - a tensor again
 << return - whether the two input tensors are identical
 */
-bool XTensor::IsIdentical(XTensor * a, XTensor * b, XTensor * c)
+bool XTensor::IsSameShaped(XTensor * a, XTensor * b, XTensor * c)
 {
-    return IsIdentical(a, b) && IsIdentical(a, c);
+    return IsSameShaped(a, b) && IsSameShaped(a, c);
 }
 /*
......
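
The block added to operator= above is what keeps the computation graph valid when a tensor that already feeds another node is overwritten. A hedged illustration of the situation it handles, reusing the MMul/LogSoftmax calls from the FNNLM changes earlier in this commit (the variable names are hypothetical):

    XTensor y;
    y = MMul(hidden, model.outputW) + b;   /* y becomes the head of this sub-graph            */
    y = LogSoftmax(y, 1);                  /* the old y is already a tail of the new node, so */
                                           /* operator= moves its data into a tmp tensor,     */
                                           /* re-links the edges (XLink::Replace, then        */
                                           /* ClearOutgoing/ClearIncoming), and only then     */
                                           /* hard-copies the right-hand side into this one   */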
@@ -207,11 +207,11 @@ public:
     /* judge whether the two matrices are in the same type and size */
     static
-    bool IsIdentical(const XTensor * a, const XTensor * b);
+    bool IsSameShaped(const XTensor * a, const XTensor * b);
     /* judge whether the three matrices are in the same type and size */
     static
-    bool IsIdentical(XTensor * a, XTensor * b, XTensor * c);
+    bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);
     /* set the size of each dimension */
     void SetDim(int * myDimSize);
......
@@ -486,9 +486,8 @@ quick sorting
 NOTE: this means that the items may not placed in a continuous memory space
 >> comp - the comparison function
 */
-void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
+void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
 {
-    XMemCopy(dataB, -1, dataA, -1, num * width);
     char *lo, *hi; // ends of sub-array currently sorting
     int *indexlo, *indexhi;
     char *mid; // points to middle of subarray
@@ -507,8 +506,8 @@ void XQSort(void * dataA, void * dataB, void * index, int num, int width, int st
     stackptr = 0;
-    lo = (char*)dataB;
-    hi = (char*)dataB + realStride * (num - 1);
+    lo = (char*)data;
+    hi = (char*)data + realStride * (num - 1);
     indexlo = (int*)index;
     indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......
@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
 extern double GetClock();
 extern double GetClockSec();
-extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
+extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
 extern int CompXFloat(const void * a, const void * b);
 #ifdef USE_CUDA
......
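
Note that the XQSort change is behavioural as well as a signature cleanup: the removed XMemCopy used to copy dataA into dataB and sort the copy, whereas the new version sorts data in place, so callers now pass the array they actually want reordered. A hedged usage sketch (the stride value of 1 and the use of CompXFloat, declared just above, are assumptions for a flat array with one index entry per item):

    DTYPE vals[4] = { 3.0F, 1.0F, 2.0F, 0.5F };
    int   idx[4]  = { 0, 1, 2, 3 };

    /* reorders vals in place and permutes idx along with it (assumed stride = 1) */
    XQSort(vals, idx, 4, sizeof(DTYPE), 1, CompXFloat);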
@@ -42,7 +42,7 @@ void _Absolute(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -60,10 +60,9 @@ set each entry to its absolute value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaAbsolute(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -34,7 +34,6 @@ __global__
 void KernelAbsolute(__half * a, __half * b, int size);
 /* set each entry to its absolute value */
-extern "C"
 void _CudaAbsolute(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -55,9 +55,9 @@ void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
     XTensor * ai = (XTensor*)a->GetItem(i);
     XTensor * bi = (XTensor*)b->GetItem(i);
     XTensor * ci = (XTensor*)c->GetItem(i);
-    if (!XTensor::IsIdentical(aim, ai) ||
-        !XTensor::IsIdentical(bim, bi) ||
-        !XTensor::IsIdentical(cim, ci))
+    if (!XTensor::IsSameShaped(aim, ai) ||
+        !XTensor::IsSameShaped(bim, bi) ||
+        !XTensor::IsSameShaped(cim, ci))
     {
         isUniform = false;
         break;
......
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* matrix multiplication in batch mode (CPU code) */
-extern "C"
 void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
                           XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
......
@@ -46,10 +46,10 @@ c = a * b * \alpha
 >> cRowSize - row size of matrix c
 >> alpha - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
                                     void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
                                     DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
......
@@ -32,17 +32,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 mutilication of a dense matrix with a sparse vector
 c = a * b * \alpha
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
                                     void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
                                     DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
 /*
 matrix multiplication (for 2d tensors) (cuda version)
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
 */
-extern "C"
 void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
                       DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
......
@@ -30,7 +30,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 matrix multiplication for a block (x1,y1) - (x2,y2)
 where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
 */
-extern "C"
 void _MatrixMul2DMultiTheading(XList * args);
 } // namespace nts(NiuTrans.Tensor)
......
@@ -31,7 +31,6 @@ matrix multiplication (for 2d tensors) with multi-threading.
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired.
 */
-extern "C"
 void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                           XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
......
@@ -113,10 +113,10 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
     _CudaBLASMatrixMULList(handle,
                            aList, transposedA,
                            bList, transposedB,
                            cList, aList->count,
                            alpha, beta);
     BacktoCudaDev(a->devID, devIDBackup);
 #else
......
@@ -34,7 +34,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i)
 >> c - result data array
 >> size - size of c
 */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -51,7 +51,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha
 >> size - size of c
 >> alpha - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -120,7 +120,6 @@ where i is the item index
 >> alpha - the coefficient
 >> leadingDim - dimension along which we perform broadcasting
 */
-extern "C"
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
     int leadingDimRDI = a->order - leadingDim - 1;
......
@@ -29,11 +29,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i) */
@@ -41,7 +41,6 @@ template<int nonZeroAlpha>__global__
 void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
 /* element-wise product of two tensors */
-extern "C"
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
 #endif // USE_CUDA
......
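
The kernel bodies are truncated in this view; for orientation, here is a minimal sketch of a kernel with the contract documented above, c(i) = a(i)*b(i) + \alpha*c(i). It illustrates the documented behaviour only and is not the file's actual implementation (hence the Sketch suffix):

    __global__
    void KernelMulElementWiseV2Sketch(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;   /* one thread per element */
        if (i < size)
            c[i] = a[i] * b[i] + alpha * c[i];
    }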
@@ -41,7 +41,7 @@ void _Negate(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -68,10 +68,9 @@ set each entry to its negtive value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaNegate(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelNegate(__half * a, __half * b, int size);
 /* set each entry to its negtive value */
-extern "C"
 void _CudaNegate(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -41,7 +41,7 @@ void _Sign(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -66,10 +66,9 @@ set each entry to its sign value
 >> a - input tensor we are processing
 >> b - output tensor we are processing
 */
-extern "C"
 void _CudaSign(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelSign(__half * a, __half * b, int size);
 /* set each entry to its sign value */
-extern "C"
 void _CudaSign(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -35,7 +35,7 @@ c = a + b * \beta
 >> size - the size of a/b/c
 >> beta - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
......
@@ -29,15 +29,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of data arrays (CUDA Kernel) */
-extern "C" __global__
+__global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (cuda version) */
-extern "C"
 void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (cuda version) with an input handle */
-extern "C"
 void _CudaSumWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
......
@@ -40,9 +40,9 @@ where b is a vector.
 void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     int rowNum = a->dimSize[0];
     int colNum = a->dimSize[1];
......
@@ -39,7 +39,7 @@ c_col = a_col + b * \beta
 >> size - size of the entire data array
 >> beta - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -67,11 +67,11 @@ where b is a vector.
 void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
                    c->dataType == DEFAULT_DTYPE), "TODO");
     int rowNum = a->dimSize[0];
     int colNum = a->dimSize[1];
......
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of a tensor and a vector (column vector) */
-extern "C"
 void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
......
@@ -27,9 +27,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* sum of a tensor and a (column) vector */
-extern "C"
 void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __SUMBYCOLUMNTV_H__
\ No newline at end of file
@@ -40,9 +40,9 @@ where c and a are vectors, and b_col is a column in b.
 void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
 #ifdef USE_CUDA
......
@@ -39,7 +39,7 @@ c = a + \sum{col} b_col * \beta
 >> size - size of the entire data array
 >> beta - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
 {
     int row = blockDim.x * blockIdx.x + threadIdx.x;
@@ -83,11 +83,11 @@ where c and a are vectors, and b_col is a column in b.
 void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
                    c->dataType == DEFAULT_DTYPE), "TODO");
     int rowNum = b->dimSize[0];
     int colNum = b->dimSize[1];
......
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of a vector (column vector) and a tensor */
-extern "C"
 void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
......
@@ -27,9 +27,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* sum of a (column) vector and a tensor */
-extern "C"
 void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __SUMBYCOLUMNVT_H__
\ No newline at end of file
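
The two column-sum primitives above are easy to confuse. Reading the comments together with the shape checks shown in the hunks (for TV: a and c are rowNum x colNum and b is a column vector of length rowNum; for VT: a and c are column vectors of length rowNum and b is rowNum x colNum), they appear to compute, entry by entry:

    _SumByColumnTV:  c_{i,j} = a_{i,j} + \beta * b_i              (b is added to every column of a)
    _SumByColumnVT:  c_i     = a_i     + \beta * \sum_j b_{i,j}   (the columns of b are reduced into the vector)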
@@ -42,7 +42,7 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
     CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
                   "Input tensors must have a order = 2!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     CheckNTErrors((b->dataType == DEFAULT_DTYPE), "TODO!");
     CheckNTErrors((c->dataType == DEFAULT_DTYPE), "TODO!");
......
@@ -143,7 +143,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
 }
 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                       const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                       const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
@@ -225,9 +224,9 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
     XTensor * ai = (XTensor*)a->GetItem(i);
     XTensor * bi = (XTensor*)b->GetItem(i);
     XTensor * ci = (XTensor*)c->GetItem(i);
-    if (!XTensor::IsIdentical(aim, ai) ||
-        !XTensor::IsIdentical(bim, bi) ||
-        !XTensor::IsIdentical(cim, ci))
+    if (!XTensor::IsSameShaped(aim, ai) ||
+        !XTensor::IsSameShaped(bim, bi) ||
+        !XTensor::IsSameShaped(cim, ci))
     {
         isUniform = false;
         break;
......
@@ -27,14 +27,12 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* matrix multiplication (BLAS) */
-extern "C"
 void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                    XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
 #ifdef USE_CUDA
 /* matrix multiplication via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMUL(cublasHandle_t * handle,
                         const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
                         const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -42,7 +40,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
                         int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
                                const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
                                const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -51,7 +48,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
                                DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                       const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                       const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
@@ -60,7 +56,6 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                       DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
                             const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
                             int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
......
@@ -27,14 +27,12 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-extern "C"
 void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
 /*
 generate a tensor with selected data in range[low,high] along the given dimension
 c = select(a)
 */
-extern "C"
 void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
 } // namespace nts(NiuTrans.Tensor)
......
@@ -27,7 +27,6 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-extern "C"
 void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
 /*
@@ -40,7 +39,6 @@ XTensor Select(const XTensor &a, XTensor &indexCPU);
 generate a tensor with selected data in range[low,high] along the given dimension
 c = select(a)
 */
-extern "C"
 void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
 /*
@@ -52,4 +50,4 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __SELECT_H__
\ No newline at end of file
@@ -42,7 +42,7 @@ void _Log(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -60,10 +60,9 @@ set each entry to its log value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaLog(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelLog(__half * a, __half * b, int size);
 /* set each entry to its log value */
-extern "C"
 void _CudaLog(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -45,9 +45,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
     int dimRDI = input->order - dim - 1;
-    CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Unmatched input tensors");
-    CheckNTErrors((XTensor::IsIdentical(mean, var)), "Unmatched input tensors");
+    CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
+    CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
     CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
......
@@ -44,8 +44,8 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 */
 __global__
 void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
                      DTYPE * a, DTYPE * b, DTYPE epsilon,
                      int stride, int strideNum, int blockNum)
 {
     __shared__ DTYPE iMean[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     __shared__ DTYPE iVar[MAX_CUDA_THREAD_NUM_PER_BLOCK];
@@ -88,11 +88,10 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 >> b - the bias
 >> epsilon - a parameter
 */
-extern "C"
 void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
                     const XTensor * mean, const XTensor * var,
                     const XTensor * a, const XTensor * b,
                     DTYPE epsilon)
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
......
@@ -35,18 +35,17 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 */
 __global__
 void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
                      DTYPE * a, DTYPE * b, DTYPE epsilon,
                      int stride, int strideNum, int blockNum);
 /*
 normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */
-extern "C"
 void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
                     const XTensor * mean, const XTensor * var,
                     const XTensor * a, const XTensor * b, DTYPE epsilon);
 #endif // USE_CUDA
......
@@ -31,7 +31,6 @@ normalized the data with normal distribution.
 For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
 */
-extern "C"
 void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
 /*
@@ -40,7 +39,6 @@ keep the result in the input tenosr and return nothing
 For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
 */
-extern "C"
 void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
 /*
......
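
For readability, the normalization that _Normalize, _NormalizeMe and _CudaNormalize document in their comments, written as a single formula (mean \mu and variance \sigma^2 are taken along the dimension dim; a and b are the scale and bias tensors):

    y = a \cdot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + b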
@@ -100,10 +100,9 @@ void KernelPower(__half * a, __half * b, __half p, int size)
 }
 /* get the power of the entries */
-extern "C"
 void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     int gridSize[3];
     int blockSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelSqrtV2(__half * a, __half * b, int size);
 /* get the power of the entries */
-extern "C"
 void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
 #endif // USE_CUDA
......
@@ -47,8 +47,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
     }
 #endif
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE),
-                  "The tensor is not in the default data type!");
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
     /* sparse tensor */
     if(a->isSparse){
......
@@ -37,7 +37,6 @@ __global__
 void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);
 /* scale and shift all tensor entires b = a * scale + shift (cuda version) */
-extern "C"
 void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);
 #endif // USE_CUDA
......
...@@ -86,7 +86,7 @@ void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, ...@@ -86,7 +86,7 @@ void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
*/ */
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID, XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize); (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
} }
} }
} }
......
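Note: the loop above moves blockNum fixed-size blocks from offsets sourceBlocks[i] to targetBlocks[i] via XMemCopy. On host memory the same pattern reduces to memcpy; the sketch below assumes both buffers live on the CPU and uses illustrative names.

#include <cstring>

// Copy blockNum blocks of blockSize bytes; block i is read from offset
// sourceBlocks[i] * blockSize and written to offset targetBlocks[i] * blockSize.
void copy_blocks_cpu(const void * source, int blockSize, const int * sourceBlocks,
                     int blockNum, void * target, const int * targetBlocks)
{
    for (int i = 0; i < blockNum; i++)
        std::memcpy((char*)target + targetBlocks[i] * blockSize,
                    (const char*)source + sourceBlocks[i] * blockSize,
                    blockSize);
}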
...@@ -39,7 +39,7 @@ Note that a grid may have a number of blocks ...@@ -39,7 +39,7 @@ Note that a grid may have a number of blocks
>> isIndexOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
int * index, int unitSize, bool isIndexOnDev, XMem * myMem) int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{ {
CheckNTErrors((unitSize == sizeof(int)), "TODO!"); CheckNTErrors((unitSize == sizeof(int)), "TODO!");
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy data by index */ /* copy data by index */
extern "C"
void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem); void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */ /* copy a number of blocks in grid */
extern "C"
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem); void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -33,7 +33,6 @@ __global__ ...@@ -33,7 +33,6 @@ __global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks); void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */ /* copy a number of blocks to target positions (cuda version) */
extern "C"
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */ /* copy a number of blocks to target positions (on site) */
extern "C"
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -72,7 +72,7 @@ copy a number of blocks from source positions to target positions (cuda version) ...@@ -72,7 +72,7 @@ copy a number of blocks from source positions to target positions (cuda version)
*/ */
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
CheckNTErrors((devID >= 0), "Wrong device to run!"); CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!"); CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
/* copy the index to the GPU memory */ /* copy the index to the GPU memory */
......
...@@ -33,7 +33,6 @@ __global__ ...@@ -33,7 +33,6 @@ __global__
void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks); void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks from source positions to target positions (cuda version) */ /* copy a number of blocks from source positions to target positions (cuda version) */
extern "C"
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data blocks by 2d layout */ /* copy data blocks by 2d layout */
extern "C"
void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem); void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -38,7 +38,7 @@ in the k-th grid ...@@ -38,7 +38,7 @@ in the k-th grid
*/ */
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev) void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{ {
CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!"); CheckNTErrors((XTensor::IsSameShaped(s, t)), "Unmatched tensors!");
int blockDimRDI = s->order - blockDim - 1; int blockDimRDI = s->order - blockDim - 1;
int blockSize = 1; int blockSize = 1;
......
...@@ -27,9 +27,8 @@ ...@@ -27,9 +27,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid. i.e., reorder the data blocks in the same memory piece*/ /* copy a number of blocks in grid. i.e., reorder the data blocks in the same memory piece*/
extern "C"
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false); void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __COPYINGRID_H__ #endif // __COPYINGRID_H__
\ No newline at end of file
...@@ -44,7 +44,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i ...@@ -44,7 +44,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!"); "the data must be kept on the same device!");
CheckNTErrors((dim < s->order && dim < t->order), "A too large dimension specified!"); CheckNTErrors((dim < s->order && dim < t->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
extern "C"
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* /*
......
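Note: _CopyIndexed above copies selected sub-tensors along a dimension, pairing srcIndex[i] with tgtIndex[i]. A hedged 2-D sketch of that idea, copying chosen columns of a row-major matrix; all names are illustrative and copyNum is simplified to 1:

// Copy column srcIndex[i] of s into column tgtIndex[i] of t, for i in [0, indexSize).
// s and t are row-major with 'rows' rows; s has sCols columns, t has tCols.
void copy_indexed_columns(const float * s, int rows, int sCols,
                          float * t, int tCols,
                          const int * srcIndex, const int * tgtIndex, int indexSize)
{
    for (int i = 0; i < indexSize; i++)
        for (int r = 0; r < rows; r++)
            t[r * tCols + tgtIndex[i]] = s[r * sCols + srcIndex[i]];
}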
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C"
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */ /* copy s to t */
extern "C"
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
/* /*
...@@ -38,4 +37,4 @@ XTensor CopyValues(const XTensor &s, XStream * stream = NULL); ...@@ -38,4 +37,4 @@ XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __COPYVALUES_H__ #endif // __COPYVALUES_H__
\ No newline at end of file
...@@ -101,8 +101,8 @@ crossing of the i-th column and the j-th row. ...@@ -101,8 +101,8 @@ crossing of the i-th column and the j-th row.
*/ */
__global__ __global__
void KernelReduceMax(__half * input, __half * output, void KernelReduceMax(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum, int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum) int blockSize, int blockNum)
{ {
int idx = threadIdx.x * blockDim.y + threadIdx.y; int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
...@@ -224,8 +224,8 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve ...@@ -224,8 +224,8 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
*/ */
template <unsigned int goodSize> __global__ template <unsigned int goodSize> __global__
void KernelReduceMaxFast(__half * input, __half * output, void KernelReduceMaxFast(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum, int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum) int blockSize, int blockNum)
{ {
unsigned int tid = threadIdx.y; unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y; unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
......
...@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the max-valued items along a dimension of the tensor (cuda version) */ /* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C"
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim); void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
extern "C"
void _ReduceMax(const XTensor * input, XTensor * output, int dim); void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/* /*
...@@ -38,4 +37,4 @@ XTensor ReduceMax(const XTensor &input, int dim); ...@@ -38,4 +37,4 @@ XTensor ReduceMax(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_H__ #endif // __REDUCEMAX_H__
\ No newline at end of file
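Note: KernelReduceMax above reduces one dimension of a tensor viewed as blocks of stride * strideNum elements. The CPU sketch below shows only the basic idea, assuming element (block k, position j along the reduced axis, offset i within the stride) is stored at k * stride * strideNum + j * stride + i; it is not the kernel's actual parallel reduction strategy.

#include <cfloat>

// Hedged CPU sketch of a max-reduction over the middle axis of the assumed layout.
void reduce_max_cpu(const float * input, float * output,
                    int stride, int strideNum, int blockNum)
{
    for (int k = 0; k < blockNum; k++) {
        for (int i = 0; i < stride; i++) {
            float m = -FLT_MAX;
            for (int j = 0; j < strideNum; j++) {
                float v = input[(k * strideNum + j) * stride + i];
                if (v > m)
                    m = v;
            }
            output[k * stride + i] = m;   // one max per (block, stride offset) pair
        }
    }
}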
...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
get the mean value along a dimension of the tensor get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/ */
extern "C"
void _ReduceMean(const XTensor * input, XTensor * output, int dim); void _ReduceMean(const XTensor * input, XTensor * output, int dim);
/* /*
...@@ -42,4 +41,4 @@ XTensor ReduceMean(const XTensor &input, int dim); ...@@ -42,4 +41,4 @@ XTensor ReduceMean(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMEAN_H__ #endif // __REDUCEMEAN_H__
\ No newline at end of file
...@@ -31,7 +31,6 @@ standard variance of the items along a dimension of the tensor ...@@ -31,7 +31,6 @@ standard variance of the items along a dimension of the tensor
For a 1-dimensional data array a, For a 1-dimensional data array a,
variance = (1/n * \sum_i (a_i - mean)^2)^0.5 variance = (1/n * \sum_i (a_i - mean)^2)^0.5
*/ */
extern "C"
void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean); void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
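Note: the comments above give the 1-D formulas: mean = (1/n) * sum_i a_i, and the "standard variance" (the square root of the variance) = (1/n * \sum_i (a_i - mean)^2)^0.5. A small self-contained sketch of both; function names are illustrative:

#include <cmath>

// mean = (1/n) * sum_i a_i
float mean_1d(const float * a, int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += a[i];
    return sum / n;
}

// (1/n * sum_i (a_i - mean)^2)^0.5, matching the comment above _ReduceStandardVariance
float standard_variance_1d(const float * a, int n, float mean)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += (a[i] - mean) * (a[i] - mean);
    return std::sqrt(sum / n);
}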
...@@ -43,22 +43,20 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true ...@@ -43,22 +43,20 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp) void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{ {
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!"); "This code must be run on the same device!");
CheckNTErrors((input && output), "Empty input or output tensors!"); CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!"); CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!"); CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!"); CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || XTensor::IsIdentical(output, shift)), "Incorrect shift tensor size!"); CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
"Unmatched tensors!");
} }
else if(i > dimRDI){ else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
"Unmatched tensors!");
} }
} }
......
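Note: per the hunk header above, each reduced slice of _ReduceSum is sum = \sum_i (a_i - shift)^power, with exp() applied to each term when isExp is true (the header comments write the shift with different signs; this sketch follows the subtraction form). A 1-D illustration with an assumed name:

#include <cmath>

// One reduced slice in 1-D form: sum_i f((a_i - shift)^power), f = exp when isExp.
float reduce_sum_1d(const float * a, int n, float shift, float power, bool isExp)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        float v = std::pow(a[i] - shift, power);
        sum += isExp ? std::exp(v) : v;
    }
    return sum;
}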
...@@ -34,7 +34,6 @@ For a 1-dimensional data array a, ...@@ -34,7 +34,6 @@ For a 1-dimensional data array a,
sum = \sum_i ((a_i + shift)^power) if isExp == false sum = \sum_i ((a_i + shift)^power) if isExp == false
sum = \sum_i exp((a_i + shift)^power) if isExp == true sum = \sum_i exp((a_i + shift)^power) if isExp == true
*/ */
extern "C"
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp); void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -32,7 +32,6 @@ For a 1-dimensional data array a, ...@@ -32,7 +32,6 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
extern "C"
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL, void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL,
DTYPE power = (DTYPE)1.0F, bool isExp = false); DTYPE power = (DTYPE)1.0F, bool isExp = false);
...@@ -56,4 +55,4 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool ...@@ -56,4 +55,4 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCESUM_H__ #endif // __REDUCESUM_H__
\ No newline at end of file
...@@ -31,7 +31,6 @@ squared sum of the items along a dimension of the tensor ...@@ -31,7 +31,6 @@ squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a, For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2 sum = \sum_i (a_i - shift)^2
*/ */
extern "C"
void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift); void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift);
/* /*
......
...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
variance of the items along a dimension of the tensor variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/ */
extern "C"
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean); void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean);
/* /*
......
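Note: the two reductions above are closely related: the squared sum is \sum_i (a_i - shift)^2, and the variance divides the same quantity (with shift = mean) by n. A 1-D sketch with illustrative names:

// sum = \sum_i (a_i - shift)^2
float reduce_sum_squared_1d(const float * a, int n, float shift)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += (a[i] - shift) * (a[i] - shift);
    return sum;
}

// variance = (1/n) * \sum_i (a_i - mean)^2
float reduce_variance_1d(const float * a, int n, float mean)
{
    return reduce_sum_squared_1d(a, n, mean) / n;
}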
...@@ -44,7 +44,7 @@ void _Concatenate(const XList * smalls, XTensor * big, int dim) ...@@ -44,7 +44,7 @@ void _Concatenate(const XList * smalls, XTensor * big, int dim)
XTensor * a = (XTensor*)smalls->GetItem(i - 1); XTensor * a = (XTensor*)smalls->GetItem(i - 1);
XTensor * b = (XTensor*)smalls->GetItem(i); XTensor * b = (XTensor*)smalls->GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!"); CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsSameShaped(a, b))
uniform = false; uniform = false;
} }
...@@ -76,7 +76,7 @@ XTensor Concatenate(const XList &smalls, int dim) ...@@ -76,7 +76,7 @@ XTensor Concatenate(const XList &smalls, int dim)
XTensor * a = (XTensor*)smalls.GetItem(i - 1); XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i); XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!"); CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsSameShaped(a, b))
uniform = false; uniform = false;
} }
XTensor * tensor = (XTensor*)smalls.GetItem(0); XTensor * tensor = (XTensor*)smalls.GetItem(0);
...@@ -177,7 +177,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim) ...@@ -177,7 +177,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
XTensor * a = (XTensor*)smalls.Get(i - 1); XTensor * a = (XTensor*)smalls.Get(i - 1);
XTensor * b = (XTensor*)smalls.Get(i); XTensor * b = (XTensor*)smalls.Get(i);
CheckNTErrors((a && b), "Empty input tensors!"); CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsSameShaped(a, b))
uniform = false; uniform = false;
} }
XTensor * tensor = (XTensor*)smalls.Get(0); XTensor * tensor = (XTensor*)smalls.Get(0);
......
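Note: the uniformity loops above only need to know whether every pair of inputs has the same shape, which is why IsIdentical is renamed to the more precise IsSameShaped throughout this commit. A hedged sketch of such a shape check on a simplified struct (the struct below is illustrative, not the XTensor definition):

// Minimal stand-in for a tensor's shape description.
struct ShapeInfo {
    int order;          // number of dimensions
    int dimSize[8];     // size of each dimension (first 'order' entries are valid)
};

// True when both tensors have the same order and the same size in every dimension.
bool IsSameShapedSketch(const ShapeInfo & a, const ShapeInfo & b)
{
    if (a.order != b.order)
        return false;
    for (int i = 0; i < a.order; i++)
        if (a.dimSize[i] != b.dimSize[i])
            return false;
    return true;
}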
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */ /* concatenate a list of tensors along a given dimension */
extern "C"
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim); void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -69,7 +69,6 @@ set target data block index for the data movement in split ...@@ -69,7 +69,6 @@ set target data block index for the data movement in split
>> gridNum - number of grids >> gridNum - number of grids
>> mem - the memory pool >> mem - the memory pool
*/ */
extern "C"
void _CudaMakeMergeBlockIndex(int devID, void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum) int splitSizeInGrid, int gridSize, int gridNum)
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge, void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum); int splitSizeInGrid, int gridSize, int gridNum);
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */ /* set target data block index for the data movement in merge */
extern "C"
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge, void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem); int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
......
...@@ -57,7 +57,6 @@ set target data block index for the data movement in split ...@@ -57,7 +57,6 @@ set target data block index for the data movement in split
>> blockSplitSize - size of the splitted block >> blockSplitSize - size of the splitted block
>> blockNum - number of data blocks >> blockNum - number of data blocks
*/ */
extern "C"
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum) void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum)
{ {
int cudaGrids[3]; int cudaGrids[3];
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum); void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem); void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -49,7 +49,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -49,7 +49,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!"); CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!"); "the data must be kept on the same device!");
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
...@@ -58,11 +58,11 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -58,11 +58,11 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) { if (i == whereToMergeRDI) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]), CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i > leadingDimRDI) { else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -99,8 +99,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -99,8 +99,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize; char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
for (int k = 0; k < mergedNum; k++) { for (int k = 0; k < mergedNum; k++) {
XMemCopy2D(tData + k * tStep, tPtich, t->devID, XMemCopy2D(tData + k * tStep, tPtich, t->devID,
sData + k * sStep, sPitch, s->devID, sData + k * sStep, sPitch, s->devID, mSize, n);
mSize, n);
} }
} }
} }
...@@ -356,7 +355,7 @@ merge two tensors into a big tensor (return a XTensor structure) ...@@ -356,7 +355,7 @@ merge two tensors into a big tensor (return a XTensor structure)
*/ */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge) XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
{ {
CheckNTErrors(XTensor::IsIdentical(&smallA, &smallB), CheckNTErrors(XTensor::IsSameShaped(&smallA, &smallB),
"The two tensors must be of the same size!"); "The two tensors must be of the same size!");
int order = smallA.order; int order = smallA.order;
......
...@@ -71,7 +71,6 @@ merge data by blocks (cuda version) ...@@ -71,7 +71,6 @@ merge data by blocks (cuda version)
>> target - target data array >> target - target data array
>> myMem - the memory pool >> myMem - the memory pool
*/ */
extern "C"
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem) void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{ {
CheckNTErrors((myMem != NULL), "No memory pool!"); CheckNTErrors((myMem != NULL), "No memory pool!");
......
...@@ -33,7 +33,6 @@ __global__ ...@@ -33,7 +33,6 @@ __global__
void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList); void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
/* merge data by blocks (cuda version) */ /* merge data by blocks (cuda version) */
extern "C"
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem); void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* merge data by blocks */ /* merge data by blocks */
extern "C"
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem); void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -41,7 +41,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -41,7 +41,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!"); "the data must be kept on the same device!");
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
...@@ -51,11 +51,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -51,11 +51,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplitRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum), CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else { else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -301,7 +301,7 @@ void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum) ...@@ -301,7 +301,7 @@ void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
XLink::AddParamToHeadInt(s, whereToSplit); XLink::AddParamToHeadInt(s, whereToSplit);
/* it is tricky here that we keep the id of each /* it is tricky here that we keep the id of each
block, rather than the total number of splits */ block, rather than the total number of the splits */
XLink::AddParamToHeadInt(s, i); XLink::AddParamToHeadInt(s, i);
} }
} }
......
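Note: _Merge glues the blocks of one dimension together along whereToMerge, turning an order-n tensor into an order-(n-1) tensor, and _Split is its inverse. The sketch below shows only the simplest row-major 3-D/2-D case, merging a [splitNum, m, n] array into [m, splitNum * n]; it may not reproduce the library's exact block ordering, and all names are illustrative.

// Merge src of shape [splitNum, m, n] (row-major) into dst of shape [m, splitNum * n],
// so that block s ends up in columns [s * n, (s + 1) * n) of dst.
void merge_3d_to_2d(const float * src, float * dst, int splitNum, int m, int n)
{
    for (int s = 0; s < splitNum; s++)
        for (int i = 0; i < m; i++)
            for (int j = 0; j < n; j++)
                dst[i * (splitNum * n) + s * n + j] = src[(s * m + i) * n + j];
}

// The matching split runs the same index mapping in the other direction.
void split_2d_to_3d(const float * src, float * dst, int splitNum, int m, int n)
{
    for (int s = 0; s < splitNum; s++)
        for (int i = 0; i < m; i++)
            for (int j = 0; j < n; j++)
                dst[(s * m + i) * n + j] = src[i * (splitNum * n) + s * n + j];
}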
...@@ -66,7 +66,6 @@ insert a dimension by copying the blocks for x times (where x is the size of the ...@@ -66,7 +66,6 @@ insert a dimension by copying the blocks for x times (where x is the size of the
>> dim - where to insert the dimension >> dim - where to insert the dimension
>> dSize - size of the newly-inserted dimension >> dSize - size of the newly-inserted dimension
*/ */
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
{ {
int blockSize = 1; int blockSize = 1;
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* duplicate the data along a given dimension */ /* duplicate the data along a given dimension */
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize); void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */ /* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
extern "C"
void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize); void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
/* /*
...@@ -39,4 +38,4 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize); ...@@ -39,4 +38,4 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __UNSQUEEZE_H__ #endif // __UNSQUEEZE_H__
\ No newline at end of file
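Note: _Unsqueeze inserts a new dimension of size dSize by repeating data blocks. Assuming a row-major layout, each contiguous block of elements that sits after the insertion point is written dSize times in a row; the sketch below expresses exactly that, with illustrative names:

// blockNum  = product of the dimension sizes before the insertion point
// blockSize = product of the dimension sizes after it (1 when inserting last)
void unsqueeze_cpu(const float * input, float * output,
                   int blockNum, int blockSize, int dSize)
{
    for (int b = 0; b < blockNum; b++)
        for (int d = 0; d < dSize; d++)
            for (int i = 0; i < blockSize; i++)
                output[(b * dSize + d) * blockSize + i] = input[b * blockSize + i];
}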
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../movement/CopyValues.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../../XName.h" #include "../../XName.h"
#include "Sort.h" #include "Sort.h"
...@@ -36,7 +37,7 @@ sort the tensor along a given dimension ...@@ -36,7 +37,7 @@ sort the tensor along a given dimension
*/ */
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!"); CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!"); CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
...@@ -63,15 +64,15 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -63,15 +64,15 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
blockNum *= a->dimSizeRDI[i]; blockNum *= a->dimSizeRDI[i];
int blockSize = stride * strideNum; int blockSize = stride * strideNum;
_CopyValues(a, b);
for (int k = 0; k < blockNum; k++) { for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) { for (int i = 0; i < stride; i++) {
void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize; void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int); void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
/* we sort the data array along "dim" */ /* we sort the data array along "dim" */
if (a->dataType == X_FLOAT) if (a->dataType == X_FLOAT)
XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat); XQSort(dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
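Note: the change above first copies a into b with _CopyValues and then lets XQSort reorder b (and the index tensor) in place, so the source tensor is never modified. A hedged standalone sketch of sorting one slice together with its indices, using std::sort instead of XQSort (descending order here for illustration; the actual order is whatever CompXFloat defines):

#include <algorithm>
#include <numeric>
#include <vector>

// Sort 'values' in descending order and keep 'indices' aligned with it,
// mirroring the copy-then-sort-in-place flow of the new _Sort.
void sort_slice_with_index(std::vector<float> & values, std::vector<int> & indices)
{
    indices.resize(values.size());
    std::iota(indices.begin(), indices.end(), 0);            // 0, 1, 2, ...
    std::sort(indices.begin(), indices.end(),
              [&](int x, int y) { return values[x] > values[y]; });

    std::vector<float> sorted(values.size());
    for (size_t i = 0; i < indices.size(); i++)
        sorted[i] = values[indices[i]];
    values.swap(sorted);                                      // values now descending
}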
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* sort the tensor along a given dimension */ /* sort the tensor along a given dimension */
extern "C"
void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1); void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -39,7 +39,6 @@ void _SortMe(XTensor * a, XTensor * index, int dim); ...@@ -39,7 +39,6 @@ void _SortMe(XTensor * a, XTensor * index, int dim);
sort the data along a given dimension (return a XTensor structure) sort the data along a given dimension (return a XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
*/ */
extern "C"
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim); void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
extern "C"
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k); void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
#endif // USE_CUDA #endif // USE_CUDA
......