Commit 7e9d7015 by xuchen

This is a fundamental integration!

parent b3a76184
......@@ -96,7 +96,7 @@ void XMathGrad::GradMultiply(XTensor * node)
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsIdentical(a, b), "Wrong sized input tensors!");
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Multiply(node->grad, b, a->grad, 1.0F);
_Multiply(node->grad, a, b->grad, 1.0F);
}
......
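For reference, the rule these two _Multiply calls accumulate (alpha = 1.0F, so the existing gradients are kept and added to) is just the element-wise product rule; restated in LaTeX for a node c = a ⊙ b:

\[
\frac{\partial E}{\partial a} \mathrel{+}= \frac{\partial E}{\partial c} \odot b,
\qquad
\frac{\partial E}{\partial b} \mathrel{+}= \frac{\partial E}{\partial c} \odot a
\]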
......@@ -71,9 +71,11 @@ dE/da = split(dE/dc)
void XShapeGrad::GradMerge(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 0, "Wrong input tensor number for MERGE!");
XTensor * input = income.tails[0];
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
CheckNTErrors(node->order == input->order - 1, "wrong tensor orders!");
int whereToMerge = income.GetParamInt(0);
int leadDim = income.GetParamInt(1);
......@@ -95,13 +97,13 @@ void XShapeGrad::GradMerge(XTensor * node)
}
dims[0] = -dims[0];
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims,
node->dataType, node->denseRatio,
node->devID, node->mem);
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
node->dataType, node->denseRatio,
node->devID, node->mem);
/* we can simply split the gradient tensor
if the input is used in merging only */
......@@ -109,7 +111,7 @@ void XShapeGrad::GradMerge(XTensor * node)
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim, input->dimSize[leadDim]);
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
}
}
......@@ -123,7 +125,7 @@ void XShapeGrad::GradMerge(XTensor * node)
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim, input->dimSize[leadDim]);
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
}
}
......@@ -162,7 +164,7 @@ void XShapeGrad::GradMergeList(XTensor * node)
smallsGrad.Add(tail->grad);
if(i > 1){
CheckNTErrors(XTensor::IsIdentical(last, tail),
CheckNTErrors(XTensor::IsSameShaped(last, tail),
"Input tensors must be of the same size!");
}
......
......@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
if(node == NULL)
return;
if(!XTensor::IsIdentical(node, node->grad)){
if(!XTensor::IsSameShaped(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
node->grad->SetZeroAll();
......
......@@ -73,8 +73,7 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, XNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
/*
entry of the program
......@@ -415,7 +414,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
}
else{
/* forward + backward process */
FBInOne(inputs, output, gold, CROSSENTROPY, model, autoDiffer);
ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
/* update model parameters */
Update(model, grad, learningRate, true);
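So in the autodiff branch the single FBInOne call is now split into an explicit forward pass and a separate backward pass driven by the XNet instance. A condensed sketch of the sequence above (autoDiffer, grad and learningRate are the surrounding locals of Train, not redefined here):

    ForwardAutoDiff(inputs, output, model);            // build the forward graph with tensor connections
    autoDiffer.Backward(output, gold, CROSSENTROPY);   // automatic differentiation w.r.t. the cross-entropy loss
    Update(model, grad, learningRate, true);           // gradient step on the model parameters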
......@@ -902,17 +904,14 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
}
/*
forward + backward in one procedure
forward process (with tensor connections)
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
*/
void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
LOSS_FUNCTION_NAME loss, FNNModel &model, XNet &net)
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
int batchSize = gold.GetDim(0);
int batchSize = inputs[0].GetDim(0);
int n = model.n;
int depth = model.hDepth;
......@@ -945,9 +944,6 @@ void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + b, 1);
/* automatic differentiation */
net.Backward(output);
}
/*
......
......@@ -127,7 +127,6 @@ struct FNNNet
};
/* entry of the program */
extern "C"
int FNNLMMain(int argc, const char ** argv);
};
......
......@@ -47,9 +47,9 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
extern "C" unsigned short FloatToFloat16(float f);
extern "C" float Float16ToFloat(unsigned short h);
extern "C" void ConvertDataType(int devID,
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
......
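A minimal sketch of the conversion helpers declared above, now without C linkage (the numeric value and the note on rounding are illustrative assumptions, not taken from this file):

    float f = 3.14159F;
    unsigned short h = FloatToFloat16(f);   // encode to a 16-bit half-precision value
    float g = Float16ToFloat(h);            // decode back; g matches f up to half-precision rounding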
......@@ -321,7 +321,7 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
continue;
XLink &outgo = t->outgo;
CheckNTErrors(outgo.head == NULL || outgo.head == t,
"Wrong head of the hyperedge!");
"Wrong head of the hyperedge!");
outgo.SetHead(t);
outgo.AddTail(h);
}
......@@ -349,6 +349,7 @@ void XLink::MakeLink(XTensor * t, XList * list, int id)
/* backward */
XLink &outgo = t->outgo;
outgo.SetHead(t);
CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
for(int i = 0; i < list->count; i++){
XTensor * t = (XTensor*)list->GetItem(i);
......
......@@ -193,7 +193,7 @@ XTensor::~XTensor()
the connectivity of the graph. To avoid a memory
leak, we release the data of the new tensor
when its parent is deleted (see ClearIncoming). */
if(isTmp && outgo.tailNum > 0){
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
......@@ -285,6 +285,27 @@ void XTensor::ShallowCopy(const XTensor &tensor)
/* overloading of the equal-sign */
XTensor& XTensor::operator= (const XTensor& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMP();
newTensor->data = data;
newTensor->dataHost = dataHost;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
newTensor->ShallowCopy(this);
data = NULL;
dataHost = NULL;
}
/* hard copy of the data array */
int size = unitNum * unitSize;
if( isInit && !isSparse && !tensor.isSparse &&
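A hypothetical scenario the new branch guards against (a sketch only, using the + overload that appears elsewhere in this commit; the point is that re-assigning a tensor which already feeds another node must not invalidate that edge):

    XTensor x, b, y;   // assume x and b are initialized with matching shapes elsewhere
    y = x + b;         // x now feeds the node that produced y
    x = y + b;         // operator= first moves x's old data into a TMP copy and re-links
                       // the graph to it, then overwrites x, so y's incoming edge stays valid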
......@@ -349,7 +370,7 @@ judge whether the two matrices are in the same type and size
>> b - another tensor to compare with
<< return - whether the two input tensors are identical
*/
bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
{
if(a == NULL || b == NULL)
return false;
......@@ -381,9 +402,9 @@ judge whether the three matrices are in the same type and size
>> c - a tensor again
<< return - whether the two input tensors are identical
*/
bool XTensor::IsIdentical(XTensor * a, XTensor * b, XTensor * c)
bool XTensor::IsSameShaped(XTensor * a, XTensor * b, XTensor * c)
{
return IsIdentical(a, b) && IsIdentical(a, c);
return IsSameShaped(a, b) && IsSameShaped(a, c);
}
/*
......
......@@ -207,11 +207,11 @@ public:
/* judge whether the two matrices are in the same type and size */
static
bool IsIdentical(const XTensor * a, const XTensor * b);
bool IsSameShaped(const XTensor * a, const XTensor * b);
/* judge whether the three matrices are in the same type and size */
static
bool IsIdentical(XTensor * a, XTensor * b, XTensor * c);
bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);
/* set the size of each dimension */
void SetDim(int * myDimSize);
......
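The rename is mechanical at the call sites touched by this commit; a typical guard now reads (sketch, mirroring the checks above; a, b and c are XTensor pointers):

    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
    CheckNTErrors(XTensor::IsSameShaped(a, b, c), "Wrong sized input tensors!");   // three-tensor overload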
......@@ -486,9 +486,8 @@ quick sorting
NOTE: this means that the items may not be placed in a continuous memory space
>> comp - the comparison function
*/
void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
{
XMemCopy(dataB, -1, dataA, -1, num * width);
char *lo, *hi; // ends of sub-array currently sorting
int *indexlo, *indexhi;
char *mid; // points to middle of subarray
......@@ -507,8 +506,8 @@ void XQSort(void * dataA, void * dataB, void * index, int num, int width, int st
stackptr = 0;
lo = (char*)dataB;
hi = (char*)dataB + realStride * (num - 1);
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......
......@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
extern double GetClock();
extern double GetClockSec();
extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b);
#ifdef USE_CUDA
......
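With the separate output buffer gone, XQSort now sorts data in place, which is how _Sort uses it later in this commit. A minimal contiguous-array sketch (the literal values are hypothetical, and the index array is assumed to be pre-filled with the original positions):

    float vals[4] = {3.0F, 1.0F, 2.0F, 0.0F};
    int   idx[4]  = {0, 1, 2, 3};                        // assumed pre-filled with original positions
    XQSort(vals, idx, 4, sizeof(float), 1, CompXFloat);  // num = 4, width = sizeof(float), stride = 1
    /* vals is reordered in place and idx is permuted alongside it */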
......@@ -42,7 +42,7 @@ void _Absolute(const XTensor * a, XTensor * b)
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
......
......@@ -60,10 +60,9 @@ set each entry to its absolute value
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaAbsolute(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......
......@@ -34,7 +34,6 @@ __global__
void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
extern "C"
void _CudaAbsolute(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -55,9 +55,9 @@ void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsIdentical(aim, ai) ||
!XTensor::IsIdentical(bim, bi) ||
!XTensor::IsIdentical(cim, ci))
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
{
isUniform = false;
break;
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
extern "C"
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
......
......@@ -46,10 +46,10 @@ c = a * b * \alpha
>> cRowSize - row size of matrix c
>> alpha - the scaling factor
*/
extern "C" __global__
__global__
void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......
......@@ -32,17 +32,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
multiplication of a dense matrix with a sparse vector
c = a * b * \alpha
*/
extern "C" __global__
__global__
void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
/*
matrix multiplication (for 2d tensors) (cuda version)
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
extern "C"
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
......
......@@ -30,7 +30,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
extern "C"
void _MatrixMul2DMultiTheading(XList * args);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -31,7 +31,6 @@ matrix multiplication (for 2d tensors) with multi-threading.
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired.
*/
extern "C"
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
......
......@@ -113,10 +113,10 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
alpha, beta);
aList, transposedA,
bList, transposedB,
cList, aList->count,
alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#else
......
......@@ -34,7 +34,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i)
>> c - result data array
>> size - size of c
*/
extern "C" __global__
__global__
void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -51,7 +51,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha
>> size - size of c
>> alpha - the coefficient
*/
extern "C" __global__
__global__
void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -120,7 +120,6 @@ where i is the item index
>> alpha - the coefficient
>> leadingDim - dimension along which we perform broadcasting
*/
extern "C"
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
......
......@@ -29,11 +29,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */
extern "C" __global__
__global__
void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
/* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
extern "C" __global__
__global__
void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
/* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i) */
......@@ -41,7 +41,6 @@ template<int nonZeroAlpha>__global__
void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
/* element-wise product of two tensors */
extern "C"
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
#endif // USE_CUDA
......
......@@ -41,7 +41,7 @@ void _Negate(const XTensor * a, XTensor * b)
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
......
......@@ -68,10 +68,9 @@ set each entry to its negative value
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaNegate(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......
......@@ -37,7 +37,6 @@ __global__
void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */
extern "C"
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -41,7 +41,7 @@ void _Sign(const XTensor * a, XTensor * b)
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
......
......@@ -66,10 +66,9 @@ set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
extern "C"
void _CudaSign(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......
......@@ -37,7 +37,6 @@ __global__
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
extern "C"
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -35,7 +35,7 @@ c = a + b * \beta
>> size - the size of a/b/c
>> beta - the coefficient
*/
extern "C" __global__
__global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......
......@@ -29,15 +29,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of data arrays (CUDA Kernel) */
extern "C" __global__
__global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (cuda version) */
extern "C"
void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (cuda version) with an input handle */
extern "C"
void _CudaSumWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
......
......@@ -40,9 +40,9 @@ where b is a vector.
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
"Illegal input vector size!");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
......
......@@ -39,7 +39,7 @@ c_col = a_col + b * \beta
>> size - size of the entire data array
>> beta - the scaling factor
*/
extern "C" __global__
__global__
void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -67,11 +67,11 @@ where b is a vector.
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
c->dataType == DEFAULT_DTYPE), "TODO");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of a tensor and a vector (column vector) */
extern "C"
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
......
......@@ -27,9 +27,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a tensor and a (column) vector */
extern "C"
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNTV_H__
\ No newline at end of file
#endif // __SUMBYCOLUMNTV_H__
......@@ -40,9 +40,9 @@ where c and a are vectors, and b_col is a column in b.
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
"Illegal input vector size!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
......
......@@ -39,7 +39,7 @@ c = a + \sum{col} b_col * \beta
>> size - size of the entire data array
>> beta - the scaling factor
*/
extern "C" __global__
__global__
void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
{
int row = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -83,11 +83,11 @@ where c and a are vectors, and b_col is a column in b.
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
c->dataType == DEFAULT_DTYPE), "TODO");
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of a vector (column vector) and a tensor */
extern "C"
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
......
......@@ -27,9 +27,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */
extern "C"
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNVT_H__
\ No newline at end of file
#endif // __SUMBYCOLUMNVT_H__
......@@ -42,7 +42,7 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
"Input tensors must have a order = 2!");
"Input tensors must have a order = 2!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((b->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((c->dataType == DEFAULT_DTYPE), "TODO!");
......
......@@ -143,7 +143,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
}
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
......@@ -225,9 +224,9 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsIdentical(aim, ai) ||
!XTensor::IsIdentical(bim, bi) ||
!XTensor::IsIdentical(cim, ci))
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
{
isUniform = false;
break;
......
......@@ -27,14 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */
extern "C"
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
......@@ -42,7 +40,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
......@@ -51,7 +48,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
......@@ -60,7 +56,6 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
......
......@@ -27,14 +27,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
extern "C"
void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C"
void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
extern "C"
void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
/*
......@@ -40,7 +39,6 @@ XTensor Select(const XTensor &a, XTensor &indexCPU);
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C"
void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
/*
......@@ -52,4 +50,4 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor)
#endif // __SELECT_H__
\ No newline at end of file
#endif // __SELECT_H__
......@@ -42,7 +42,7 @@ void _Log(const XTensor * a, XTensor * b)
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
......
......@@ -60,10 +60,9 @@ set each entry to its log value
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaLog(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......
......@@ -37,7 +37,6 @@ __global__
void KernelLog(__half * a, __half * b, int size);
/* set each entry to its log value */
extern "C"
void _CudaLog(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -45,9 +45,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, b)), "Unmatched input tensors");
CheckNTErrors((XTensor::IsIdentical(mean, var)), "Unmatched input tensors");
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
......
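Restating the transformation these shape checks protect (from the comment in this file):

\[
y = a \cdot \frac{x - \mathrm{mean}}{\sqrt{\mathrm{variance} + \epsilon}} + b
\]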
......@@ -44,8 +44,8 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
*/
__global__
void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum)
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum)
{
__shared__ DTYPE iMean[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE iVar[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......@@ -88,11 +88,10 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
extern "C"
void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b,
DTYPE epsilon)
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b,
DTYPE epsilon)
{
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
......
......@@ -35,18 +35,17 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
*/
__global__
void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum);
DTYPE * a, DTYPE * b, DTYPE epsilon,
int stride, int strideNum, int blockNum);
/*
normalize the data with a normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
extern "C"
void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
#endif // USE_CUDA
......
......@@ -31,7 +31,6 @@ normalize the data with a normal distribution.
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
extern "C"
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
......@@ -40,7 +39,6 @@ keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
extern "C"
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
......
......@@ -100,10 +100,9 @@ void KernelPower(__half * a, __half * b, __half p, int size)
}
/* get the power of the entries */
extern "C"
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
int gridSize[3];
int blockSize[3];
......
......@@ -37,7 +37,6 @@ __global__
void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */
extern "C"
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA
......
......@@ -47,8 +47,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE),
"The tensor is not in the default data type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
/* sparse tensor */
if(a->isSparse){
......
......@@ -37,7 +37,6 @@ __global__
void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);
/* scale and shift all tensor entires b = a * scale + shift (cuda version) */
extern "C"
void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);
#endif // USE_CUDA
......
......@@ -86,7 +86,7 @@ void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
*/
for (int i = 0; i < blockNum; i++) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
}
}
}
......
......@@ -39,7 +39,7 @@ Note that a grid may have a number of blocks
>> isIndexOnDev - indicates whether the index is on the device already
*/
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{
CheckNTErrors((unitSize == sizeof(int)), "TODO!");
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy data by index */
extern "C"
void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */
extern "C"
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -33,7 +33,6 @@ __global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */
extern "C"
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */
extern "C"
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -72,7 +72,7 @@ copy a number of blocks from source positions to target positions (cuda version)
*/
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{
CheckNTErrors((devID >= 0), "Wrong device to run!");
CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
/* copy the index to the GPU memory */
......
......@@ -33,7 +33,6 @@ __global__
void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks from source positions to target positions (cuda version) */
extern "C"
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data blocks by 2d layout */
extern "C"
void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -38,7 +38,7 @@ in the k-th grid
*/
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!");
CheckNTErrors((XTensor::IsSameShaped(s, t)), "Unmatched tensors!");
int blockDimRDI = s->order - blockDim - 1;
int blockSize = 1;
......
......@@ -27,9 +27,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid. i.e., reorder the data blocks in the same memory piece*/
extern "C"
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYINGRID_H__
\ No newline at end of file
#endif // __COPYINGRID_H__
......@@ -44,7 +44,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
"the data must be kept on the same device!");
CheckNTErrors((dim < s->order && dim < t->order), "A too larget dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */
extern "C"
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/*
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
extern "C"
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
extern "C"
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
/*
......@@ -38,4 +37,4 @@ XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYVALUES_H__
\ No newline at end of file
#endif // __COPYVALUES_H__
......@@ -101,8 +101,8 @@ crossing of the i-th column and the j-th row.
*/
__global__
void KernelReduceMax(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
{
int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
......@@ -224,8 +224,8 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
*/
template <unsigned int goodSize> __global__
void KernelReduceMaxFast(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum)
{
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
......
......@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C"
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */
extern "C"
void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/*
......@@ -38,4 +37,4 @@ XTensor ReduceMax(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_H__
\ No newline at end of file
#endif // __REDUCEMAX_H__
......@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
extern "C"
void _ReduceMean(const XTensor * input, XTensor * output, int dim);
/*
......@@ -42,4 +41,4 @@ XTensor ReduceMean(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMEAN_H__
\ No newline at end of file
#endif // __REDUCEMEAN_H__
......@@ -31,7 +31,6 @@ standard variance of the items along a dimension of the tensor
For a 1-dimensional data array a,
variance = (1/n * \sum_i (a_i - mean)^2)^0.5
*/
extern "C"
void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -43,22 +43,20 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!");
"This code must be run on the same device!");
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || XTensor::IsIdentical(output, shift)), "Incorrect shift tensor size!");
CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
"Unmatched tensors!");
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
"Unmatched tensors!");
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
}
}
......
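The reduction being checked here, restated from the fuller comment in the hunk header above:

\[
\mathrm{sum} = \sum_i (a_i - \mathrm{shift})^{\,\mathrm{power}} \quad (\texttt{isExp == false}),
\qquad
\mathrm{sum} = \sum_i \exp\!\big((a_i - \mathrm{shift})^{\,\mathrm{power}}\big) \quad (\texttt{isExp == true})
\]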
......@@ -34,7 +34,6 @@ For a 1-dimensional data array a,
sum = \sum_i ((a_i + shift)^power) if isExp == false
sum = \sum_i exp((a_i + shift)^power) if isExp == true
*/
extern "C"
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp);
#endif // USE_CUDA
......
......@@ -32,7 +32,6 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true
*/
extern "C"
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL,
DTYPE power = (DTYPE)1.0F, bool isExp = false);
......@@ -56,4 +55,4 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCESUM_H__
\ No newline at end of file
#endif // __REDUCESUM_H__
......@@ -31,7 +31,6 @@ squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
*/
extern "C"
void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift);
/*
......
......@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
extern "C"
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean);
/*
......
......@@ -44,7 +44,7 @@ void _Concatenate(const XList * smalls, XTensor * big, int dim)
XTensor * a = (XTensor*)smalls->GetItem(i - 1);
XTensor * b = (XTensor*)smalls->GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b))
if (!XTensor::IsSameShaped(a, b))
uniform = false;
}
......@@ -76,7 +76,7 @@ XTensor Concatenate(const XList &smalls, int dim)
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b))
if (!XTensor::IsSameShaped(a, b))
uniform = false;
}
XTensor * tensor = (XTensor*)smalls.GetItem(0);
......@@ -177,7 +177,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
XTensor * a = (XTensor*)smalls.Get(i - 1);
XTensor * b = (XTensor*)smalls.Get(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b))
if (!XTensor::IsSameShaped(a, b))
uniform = false;
}
XTensor * tensor = (XTensor*)smalls.Get(0);
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */
extern "C"
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -69,7 +69,6 @@ set target data block index for the data movement in split
>> gridNum - number of grids
>> mem - the memory pool
*/
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set target data block index for the data movement in split */
extern "C"
void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */
extern "C"
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
......
......@@ -57,7 +57,6 @@ set target data block index for the data movement in split
>> blockSplitSize - size of the splitted block
>> blockNum - number of data blocks
*/
extern "C"
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum)
{
int cudaGrids[3];
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set target data block index for the data movement in split */
extern "C"
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -49,7 +49,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
"the data must be kept on the same device!");
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
......@@ -58,11 +58,11 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
"Unmatched tensor sizes!");
"Unmatched tensor sizes!");
}
else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!");
"Unmatched tensor sizes!");
}
}
......@@ -99,8 +99,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
for (int k = 0; k < mergedNum; k++) {
XMemCopy2D(tData + k * tStep, tPtich, t->devID,
sData + k * sStep, sPitch, s->devID,
mSize, n);
sData + k * sStep, sPitch, s->devID, mSize, n);
}
}
}
......@@ -356,7 +355,7 @@ merge two tensors into a big tensor (return a XTensor structure)
*/
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
{
CheckNTErrors(XTensor::IsIdentical(&smallA, &smallB),
CheckNTErrors(XTensor::IsSameShaped(&smallA, &smallB),
"The two tensors must be of the same size!");
int order = smallA.order;
......
......@@ -71,7 +71,6 @@ merge data by blocks (cuda version)
>> target - target data array
>> myMem - the memory pool
*/
extern "C"
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
CheckNTErrors((myMem != NULL), "No memory pool!");
......
......@@ -33,7 +33,6 @@ __global__
void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
/* merge data by blocks (cuda version) */
extern "C"
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* merge data by blocks */
extern "C"
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -41,7 +41,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!");
"the data must be kept on the same device!");
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
......@@ -51,11 +51,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
"Unmatched tensor sizes!");
"Unmatched tensor sizes!");
}
else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!");
"Unmatched tensor sizes!");
}
}
......@@ -301,7 +301,7 @@ void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
XLink::AddParamToHeadInt(s, whereToSplit);
/* it is tricky here that we keep the id of each
block, rather than the total number of splits */
block, rather than the total number of the splits */
XLink::AddParamToHeadInt(s, i);
}
}
......
......@@ -66,7 +66,6 @@ insert a dimension by copying the blocks for x times (where x is the size of the
>> dim - where to insert the dimension
>> dSize - size of the newly-inserted dimension
*/
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
{
int blockSize = 1;
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* duplicate the data along a given dimension */
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
#endif // USE_CUDA
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* insert a dimension by copying the blocks x times (where x is the size of the inserted dimension) */
extern "C"
void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
/*
......@@ -39,4 +38,4 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor)
#endif // __UNSQUEEZE_H__
\ No newline at end of file
#endif // __UNSQUEEZE_H__
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../movement/CopyValues.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "Sort.h"
......@@ -36,7 +37,7 @@ sort the tensor along a given dimension
*/
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
......@@ -63,15 +64,15 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
blockNum *= a->dimSizeRDI[i];
int blockSize = stride * strideNum;
_CopyValues(a, b);
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) {
void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
/* we sort the data array along "dim" */
if (a->dataType == X_FLOAT)
XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
XQSort(dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
else {
ShowNTErrors("TODO!");
}
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* sort the tensor along a given dimension */
extern "C"
void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
#endif // USE_CUDA
......
......@@ -39,7 +39,6 @@ void _SortMe(XTensor * a, XTensor * index, int dim);
sort the data along a given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
*/
extern "C"
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* get the top-k items along a given dimension */
extern "C"
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
#endif // USE_CUDA
......
......@@ -27,13 +27,11 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the top-k items along a given dimension */
extern "C"
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/* get the top-k items along a given dimension */
extern "C"
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k);
} // namespace nts(NiuTrans.Tensor)
#endif // __TOPK_H__
\ No newline at end of file
#endif // __TOPK_H__
......@@ -63,7 +63,6 @@ set the cell to the ascending order along a given dimension
>> a - the tensor
>> dim - the dimension
*/
extern "C"
void CudaSetAscendingOrder(XTensor * a, int dim)
{
CheckNTErrors((a->dataType == X_INT), "TODO!");
......
......@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set the cell to the ascending order along a given dimension */
extern "C"
void CudaSetAscendingOrder(XTensor * a, int dim);
#endif // USE_CUDA
......
......@@ -28,15 +28,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* segmentation and parallel processing for 2d tensors (i.e., matrices) */
/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
extern "C"
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
/* segment a block into sub-blocks */
extern "C"
int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex);
/* segment a block into sub-blocks */
extern "C"
int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -106,7 +106,7 @@ void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((gold == NULL || XTensor::IsIdentical(gold, y)),
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
#ifdef USE_CUDA
......
......@@ -35,11 +35,9 @@ y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
*/
extern "C"
void _CudaHardTanH(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
extern "C"
void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
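The piecewise definition above simply clamps x to [-1, 1], and its derivative is 1 inside that interval and 0 outside. A minimal CPU sketch of both directions, independent of the CUDA kernels declared here:

#include <vector>

/* Illustrative sketch only: hard tanh clamps x to [-1, 1]; its derivative is
   1 inside the interval and 0 outside, which is what the backward pass uses. */
void HardTanHForwardCPU(const std::vector<float> &x, std::vector<float> &y)
{
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); i++)
        y[i] = x[i] > 1.0F ? 1.0F : (x[i] < -1.0F ? -1.0F : x[i]);
}

void HardTanHGradCPU(const std::vector<float> &x, std::vector<float> &dydx)
{
    dydx.resize(x.size());
    for (size_t i = 0; i < x.size(); i++)
        dydx[i] = (x[i] >= -1.0F && x[i] <= 1.0F) ? 1.0F : 0.0F;
}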
......@@ -72,7 +72,7 @@ void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((gold == NULL || XTensor::IsIdentical(gold, y)),
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
......
......@@ -309,7 +309,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
}
else {
CheckNTErrors((XTensor::IsIdentical(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
gp = (DTYPE*)gold->data + k * blockSize;
op = (DTYPE*)y->data + k * blockSize;
......@@ -363,7 +363,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
}
else {
CheckNTErrors((XTensor::IsIdentical(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
gp = (DTYPE*)gold->data + k * blockSize;
op = (DTYPE*)y->data + k * blockSize;
......
......@@ -190,7 +190,7 @@ set dE/dx = exp(y)
>> size - size of output
>> lossName - name of the loss function
*/
extern "C" __global__
__global__
void KernelExpLoss(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size, LOSS_FUNCTION_NAME lossName)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -409,7 +409,7 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
dedx->dimSize[0], dedx->dimSize[1], gold->unitNumNonZero, lossName);
}
else {
CheckNTErrors((XTensor::IsIdentical(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
GDevs.GetCudaThread(x->devID, blockSize, cudaGridSize, cudaBlockSize);
......
......@@ -48,7 +48,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE error = 0.0F;
if (output->devID < 0) {
CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsIdentical(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE),
......@@ -206,7 +206,7 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
int leadDim, int gBeg, int gLen, int oBeg)
{
CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
CheckNTErrors(XTensor::IsIdentical(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(XTensor::IsSameShaped(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!");
CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
......@@ -402,9 +402,10 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
if (y->devID < 0) {
CheckNTErrors(tLen <= y->unitNum, "Illegal input length!");
CheckNTErrors(XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y),
"The input tensors must be of the same size!");
CheckNTErrors((dedy->devID == t->devID) && (dedy->devID == y->devID), "Tensor must be on the same device!");
CheckNTErrors(XTensor::IsSameShaped(t, y)&& XTensor::IsSameShaped(dedy, y),
"The input tensors must be of the same size!");
CheckNTErrors((dedy->devID == t->devID) && (dedy->devID == y->devID),
"Tensor must be on the same device!");
CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");
......
......@@ -55,7 +55,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int yBeg)
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsIdentical(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
......@@ -223,7 +223,7 @@ backward computation for squared error (Cuda kernel)
>> y - model output (in vector)
>> size - size of the vector (dedy)
*/
extern "C" __global__
__global__
void KernelLossBackwardSquaredError(DTYPE * dedy, DTYPE * t, DTYPE * y, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
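For reference, assuming the usual squared-error convention E = 1/2 * sum_i (y_i - t_i)^2, the gradient this kernel writes is dE/dy_i = y_i - t_i. A plain CPU sketch of the same update (illustrative only, without the CUDA thread indexing):

/* Illustrative sketch only: dedy[i] = y[i] - t[i] for squared-error loss. */
void LossBackwardSquaredErrorCPU(float * dedy, const float * t, const float * y, int size)
{
    for (int i = 0; i < size; i++)
        dedy[i] = y[i] - t[i];
}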
......@@ -243,7 +243,7 @@ backward computation of blocks for squared error (Cuda kernel)
>> lenInBlock - number of items in a block for computation
>> size - size of the vector (dedy)
*/
extern "C" __global__
__global__
void KernelLossBackwardSquaredErrorBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
int blockSize, int begInBlock, int lenInBlock, int size)
{
......@@ -266,7 +266,7 @@ backward computation for cross entropy (Cuda kernel)
>> y - model output (in vector)
>> size - size of the vector (dedy)
*/
extern "C" __global__
__global__
void KernelLossBackwardCrossEntropy(DTYPE * dedy, DTYPE * t, DTYPE * y, int tBeg, int tLen, int yBeg, int blockNum, int stride, int dimensionSize)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
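Similarly, for cross entropy E = -sum_i t_i * log(y_i), the per-element gradient is dE/dy_i = -t_i / y_i. A plain CPU sketch (the kernel above additionally handles block and stride addressing, omitted here):

/* Illustrative sketch only: dedy[i] = -t[i] / y[i] for cross-entropy loss. */
void LossBackwardCrossEntropyCPU(float * dedy, const float * t, const float * y, int size)
{
    for (int i = 0; i < size; i++)
        dedy[i] = -t[i] / y[i];
}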
......@@ -298,7 +298,7 @@ backward computation for cross entropy (Cuda kernel)
>> lenInBlock - number of items in a block for computation
>> size - size of the vector (dedy)
*/
extern "C" __global__
__global__
void KernelLossBackwardCrossEntropyBlock(DTYPE * dedy, DTYPE * t, DTYPE * y,
int blockSize, int begInBlock, int lenInBlock, int size)
{
......@@ -333,20 +333,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
int leadDim, int tBeg, int tLen, int yBeg)
{
CheckNTErrors((tLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y)),
"The input tensors must be of the same size!");
CheckNTErrors(((dedy->devID == t->devID) && (dedy->devID == y->devID)), "Tensor must be on the same device!");
CheckNTErrors((XTensor::IsSameShaped(t, y)&& XTensor::IsSameShaped(dedy, y)),
"The input tensors must be of the same size!");
CheckNTErrors(((dedy->devID == t->devID) && (dedy->devID == y->devID)),
"Tensor must be on the same device!");
CheckNTErrors((t->order > leadDim), "Illegal leading dimension!");
CheckNTErrors((t->dataType == DEFAULT_DTYPE &&
y->dataType == DEFAULT_DTYPE &&
dedy->dataType == DEFAULT_DTYPE),
"Input vectors are not in default type.");
y->dataType == DEFAULT_DTYPE &&
dedy->dataType == DEFAULT_DTYPE),
"Input vectors are not in default type.");
CheckNTErrors((dedy->devID >= 0 && t->devID >= 0 && y->devID >= 0),
"The backward compuation must be performed on GPUs.");
"The backward compuation must be performed on GPUs.");
CheckNTErrors((dedy->devID == t->devID && dedy->devID == y->devID),
"The vectors must be on the same GPU.");
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
......
......@@ -30,21 +30,17 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compute the loss (cuda version) */
extern "C"
DTYPE _CudaLossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss in log scale (cuda version) */
extern "C"
DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element (cuda version) */
extern "C"
DTYPE _CudaLossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors (cuda version) */
extern "C"
void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
......
......@@ -47,25 +47,21 @@ loss function to measure the "number" of errors
*/
/* compute the loss */
extern "C"
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */
extern "C"
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element */
extern "C"
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors */
extern "C"
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
} // namespace nts(NiuTrans.Tensor)
#endif // __LOSS_H__
\ No newline at end of file
#endif // __LOSS_H__
......@@ -103,7 +103,7 @@ void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((gold == NULL || XTensor::IsIdentical(gold, y)),
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
#ifdef USE_CUDA
......
......@@ -30,11 +30,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* rectify function y = max(0, x) (Cuda version) */
extern "C"
void _CudaRectify(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
extern "C"
void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
......@@ -94,8 +94,8 @@ void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors((gold == NULL || XTensor::IsIdentical(gold, y)),
"The tensors must be of the same size!");
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
......
......@@ -30,11 +30,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* sigmoid function y = 1/(1 + exp(-x)) (Cuda version) */
extern "C"
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
extern "C"
void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
......@@ -230,7 +230,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
}
else{
CheckNTErrors((XTensor::IsIdentical(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for(int k = 0; k < blockNum; k++){
gp = (DTYPE*)gold->data + k * blockSize;
op = (DTYPE*)y->data + k * blockSize;
......@@ -269,7 +269,7 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
}
else{
CheckNTErrors((XTensor::IsIdentical(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
for(int k = 0; k < blockNum; k++){
gp = (DTYPE*)gold->data + k * blockSize;
op = (DTYPE*)y->data + k * blockSize;
......
......@@ -167,7 +167,7 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
{
CheckNTErrors((x->devID >= 0), "Forward computation of softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((XTensor::IsIdentical(x, y)), "Input tensors must be of the same size!");
CheckNTErrors((XTensor::IsSameShaped(x, y)), "Input tensors must be of the same size!");
int leadDimRDI = y->order - leadDim - 1;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
......
......@@ -30,15 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void _CudaSoftmax(const XTensor * input, XTensor * output, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
/* de/dx (Cuda version) */
extern "C"
void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
......
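The softmax y_i = e^{x_i} / sum_j e^{x_j} is normally computed after subtracting the per-slice maximum for numerical stability, which is presumably why _CudaSoftmaxSumMax takes precomputed sum and max tensors. A standalone CPU sketch over one slice (illustrative only):

#include <cmath>
#include <vector>

/* Illustrative sketch only: stable softmax of one slice; subtract the max,
   exponentiate, then normalize by the sum. */
void SoftmaxSliceCPU(const std::vector<float> &x, std::vector<float> &y)
{
    if (x.empty())
        return;
    float maxVal = x[0];
    for (size_t i = 1; i < x.size(); i++)
        maxVal = x[i] > maxVal ? x[i] : maxVal;

    float sum = 0.0F;
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); i++) {
        y[i] = std::exp(x[i] - maxVal);
        sum += y[i];
    }
    for (size_t i = 0; i < x.size(); i++)
        y[i] /= sum;
}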
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Absolute Function */
extern "C"
bool TestAbsolute();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Concatenate Function */
extern "C"
bool TestConcatenate();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ConcatenateSolely Function */
extern "C"
bool TestConcatenateSolely();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ConvertDataType Function */
extern "C"
bool TestConvertDataType();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for CopyIndexed Function */
extern "C"
bool TestCopyIndexed();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for CopyValues Function */
extern "C"
bool TestCopyValues();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for HardTanH Function */
extern "C"
bool TestHardTanH();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Identity Function */
extern "C"
bool TestIdentity();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Log Function */
extern "C"
bool TestLog();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for LogSoftmax Function */
extern "C"
bool TestLogSoftmax();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Loss Function */
extern "C"
bool TestLoss();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MatrixMul Function */
extern "C"
bool TestMatrixMul();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -248,7 +248,6 @@ bool TestMatrixMul2D2()
*/
/* test for MatrixMul2D Function */
extern "C"
bool TestMatrixMul2D()
{
XPRINT(0, stdout, "[TEST MATRIXMUL2D] matrix multiplication (for 2d tensors) \n");
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MatrixMul2D Function */
extern "C"
bool TestMatrixMul2D();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MatrixMul2DParallel Function */
extern "C"
bool TestMatrixMul2DParallel();
} // namespace nts(NiuTrans.Tensor)
......
......@@ -210,14 +210,14 @@ bool TestSort()
XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");
bool returnFlag = true, caseFlag = true;
///* case 1 test */
//caseFlag = TestSort1();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 1 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 1 test */
caseFlag = TestSort1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSort2();
......
......@@ -69,7 +69,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for all Function */
extern "C"
bool Test();
} // namespace nts(NiuTrans.Tensor)
......