Commit 2bb8754f by xiaotong

code for inference of fnnlm

parent f31bc3fb
......@@ -1106,6 +1106,7 @@ void Test(const char * test, const char * result, FNNModel &model)
/* the gold standard */
XTensor gold;
if (!autoDiff) {
/* prepare an empty network for building the fnn */
FNNNet net;
......@@ -1118,6 +1119,10 @@ void Test(const char * test, const char * result, FNNModel &model)
/* forward computation */
Forward(inputs, output, model, net);
}
else {
ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
XTensor probs;
......
......@@ -57,6 +57,11 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
......@@ -213,83 +218,15 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
XTensor * tensorBuf = new XTensor[blockNum * 3];
XTensor * aBuf = tensorBuf;
XTensor * bBuf = tensorBuf + blockNum;
XTensor * cBuf = tensorBuf + blockNum * 2;
for (int p = 0; p < blockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
void * bp = (char*)b->data + bRealBlockSize * p;
void * cp = (char*)c->data + cRealBlockSize * p;
XTensor * ai = aBuf + p;
XTensor * bi = bBuf + p;
XTensor * ci = cBuf + p;
InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
ai->data = ap;
bi->data = bp;
ci->data = cp;
aList->Add(ai);
bList->Add(bi);
cList->Add(ci);
}
if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The code must be run on the same GPU!");
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
for (int i = 0; i < aList->count; i++) {
XTensor * ai = (XTensor*)aList->GetItem(i);
ai->data = NULL;;
}
for (int i = 0; i < bList->count; i++) {
XTensor * bi = (XTensor*)bList->GetItem(i);
bi->data = NULL;
}
_CudaBLASMatrixMULBatchedStrided(handle,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
for (int i = 0; i < cList->count; i++) {
XTensor * ci = (XTensor*)cList->GetItem(i);
ci->data = NULL;
}
delete[] tensorBuf;
delete aList;
delete bList;
delete cList;
}
/*
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论