Commit 2bb8754f by xiaotong

Add inference (test-time forward) code for the FNN language model (FNNLM).

parent f31bc3fb
...@@ -1106,18 +1106,23 @@ void Test(const char * test, const char * result, FNNModel &model) ...@@ -1106,18 +1106,23 @@ void Test(const char * test, const char * result, FNNModel &model)
/* the gold standard */ /* the gold standard */
XTensor gold; XTensor gold;
/* prepare an empty network for building the fnn */ if (!autoDiff) {
FNNNet net; /* prepare an empty network for building the fnn */
FNNNet net;
/* make the input tensor for position i */ /* make the input tensor for position i */
for (int i = 0; i < model.n - 1; i++) for (int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem); MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
/* make the gold tensor */ /* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem); MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
/* forward computation */ /* forward computation */
Forward(inputs, output, model, net); Forward(inputs, output, model, net);
}
else {
ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */ /* prediction probabilities */
XTensor probs; XTensor probs;
......
...@@ -57,6 +57,11 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -57,6 +57,11 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == b->order && a->order == c->order), CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!"); "Input tensor and output tensor must have same order!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1]; int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
...@@ -213,83 +218,15 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -213,83 +218,15 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
blockNum *= a->dimSizeRDI[i]; blockNum *= a->dimSizeRDI[i];
} }
XList * aList = new XList(10); cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
XList * bList = new XList(10); _CudaBLASMatrixMULBatchedStrided(handle,
XList * cList = new XList(10); a->data, transposedA, a->dataType, aBlockSize,
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]}; b->data, transposedB, b->dataType, bBlockSize,
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]}; c->data, c->dataType, cBlockSize, blockNum,
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]}; a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
XTensor * tensorBuf = new XTensor[blockNum * 3]; c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
XTensor * aBuf = tensorBuf;
XTensor * bBuf = tensorBuf + blockNum;
XTensor * cBuf = tensorBuf + blockNum * 2;
for (int p = 0; p < blockNum; p++) {
void * ap = (char*)a->data + aRealBlockSize * p;
void * bp = (char*)b->data + bRealBlockSize * p;
void * cp = (char*)c->data + cRealBlockSize * p;
XTensor * ai = aBuf + p;
XTensor * bi = bBuf + p;
XTensor * ci = cBuf + p;
InitTensor(ai, 2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
InitTensor(bi, 2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
InitTensor(ci, 2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
ai->data = ap;
bi->data = bp;
ci->data = cp;
aList->Add(ai);
bList->Add(bi);
cList->Add(ci);
}
if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
#ifdef USE_CUDA
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The code must be run on the same GPU!");
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
for (int i = 0; i < aList->count; i++) {
XTensor * ai = (XTensor*)aList->GetItem(i);
ai->data = NULL;;
}
for (int i = 0; i < bList->count; i++) {
XTensor * bi = (XTensor*)bList->GetItem(i);
bi->data = NULL;
}
for (int i = 0; i < cList->count; i++) {
XTensor * ci = (XTensor*)cList->GetItem(i);
ci->data = NULL;
}
delete[] tensorBuf;
delete aList;
delete bList;
delete cList;
} }
/* /*
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论