Commit 00e5e46d by linye

fnnlm float16 training supported; some bugs remain and liutengbo needs to fix them

parent d2c7e39a
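The float16 path added below keeps most of the forward pass in FP32 and converts to FP16 only around the operator under test, so each FP16 backward implementation can be checked in isolation. A minimal sketch of that wrapper pattern for the output layer, using only the XTensor calls that appear in this diff (MMul, ConvertDataType, Softmax); the helper name MixedPrecisionSoftmax is illustrative and not part of the commit:

/* sketch only: run the output layer in FP32, test Softmax in FP16,
   then convert back so the rest of the graph stays FP32 */
XTensor MixedPrecisionSoftmax(const XTensor &hidden, FNNModel &model)
{
    XTensor before;      /* FP32 pre-softmax activations */
    XTensor before16;    /* FP16 copy fed to the FP16 Softmax kernel */
    XTensor output16;    /* FP16 softmax result */

    before   = MMul(hidden, model.outputW) + model.outputB;
    before16 = ConvertDataType(before, X_FLOAT16);
    output16 = Softmax(before16, 1);

    /* convert back to FP32 so CrossEntropy and Backward run unchanged */
    return ConvertDataType(output16, X_FLOAT);
}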
@@ -77,6 +77,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
@@ -123,6 +124,8 @@ int FNNLMMain(int argc, const char ** argv)
/* load arguments */
LoadArgs(argc, argv, model);
srand(1);
/* check the setting */
Check(model);
@@ -543,11 +546,34 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* forward + backward process */
/* this is implemented by gather function */
//ForwardAutoDiff(ngrams, ngramNum, output, model);
ForwardAutoDiffLin(ngrams, ngramNum, output, model);
//XNet net;
//net.ShowNetwork(stdout, &output);
//FILE* fOut1 = fopen("test-output", "w");
//output.Dump(&output, fOut1, "output");
//fclose(fOut1);
//fflush(fOut1);
//if (step==216)
//{
// exit(1);
//}
/* this is implemented by multiply function */
lossTensor = CrossEntropy(output, gold);
//FILE* fOut1 = fopen("test3", "a");
//fprintf(fOut1, "step=%d ", step);
//lossTensor.Dump(&lossTensor, fOut1, "lossTensor:");
//fclose(fOut1);
//fflush(fOut1);
int stepTmp = step+1;
/* automatic differentiation */
autoDiffer.Backward(lossTensor);
@@ -567,7 +593,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
break;
}
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
@@ -637,13 +663,21 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
gradList.Add(model.embeddingW.grad);
}
//FILE* fOut1 = fopen("test-2", "a");
for (int i = 0; i < paraList.count; i++) {
XTensor * para = (XTensor*)paraList.GetItem(i);
XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
//fprintf(fOut1, "id=%d ", para->id);
//para->Dump(para, fOut1, "para:", 50);
//paraGrad->Dump(paraGrad, fOut1, "paraGrad:", 50);
/* the delta rule */
_Sum(para, paraGrad, para, -epsilon);
}
//fprintf(fOut1, "\n");
//fclose(fOut1);
//fflush(fOut1);
}
/*
@@ -792,8 +826,16 @@ void InitZeroOneTensor2DFp16(XTensor &tensor, int rowNum, int colNum, int * rows
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT16, devID);
XTensor tensor1;
InitTensor2DV2(&tensor1, rowNum, colNum, X_FLOAT, devID);
tensor1.SetZeroAll();
/* set non-zero cells */
for (int i = 0; i < itemNum; i++)
tensor1.Set2D(1.0F, rows[i], cols[i]);
_ConvertDataType(&tensor1, &tensor);
}
/*
@@ -1086,14 +1128,152 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */
for (int i = 0; i < depth; i++) {
//XTensor hiddenBefore;
//hiddenBefore = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
//if (hiddenBefore.dataType == X_FLOAT16) {
// XTensor hiddenBeforeFp32;
// hiddenBeforeFp32 = ConvertDataType(hiddenBefore, X_FLOAT);
// XTensor hiddenFp32;
// hiddenFp32 = HardTanH(hiddenBeforeFp32);
// hidden = ConvertDataType(hiddenFp32, X_FLOAT16);
//}
//else {
// hidden = HardTanH(hiddenBefore);
//}
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
}
/* output layer */
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XTensor softmaxBefore;
//softmaxBefore = MMul(hidden, model.outputW) + model.outputB;
//if (softmaxBefore.dataType == X_FLOAT16) {
// XTensor softmaxBeforeFp32;
// softmaxBeforeFp32 = ConvertDataType(softmaxBefore, X_FLOAT);
// XTensor outputeFp32;
// outputeFp32 = Softmax(softmaxBeforeFp32, 1);
// output = ConvertDataType(outputeFp32, X_FLOAT16);
//}
//else {
// output = Softmax(softmaxBefore, 1);
//}
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n - 1);
int * index = new int[size];
for (int i = 0; i < batch; i++) {
for (int j = 0; j < n - 1; j++) {
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size);
/*test for Gather float16 datatype backward*/
//XTensor embeddingW16;
//XTensor embeddingBig16;
//embeddingW16 = ConvertDataType(model.embeddingW, X_FLOAT16);
//embeddingBig16 = Gather(embeddingW16, words);
//embeddingBig = ConvertDataType(embeddingBig16, X_FLOAT);
embeddingBig = Gather(model.embeddingW, words);
delete[] index;
int dimSize[2];
dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
/*test for Reshape float16 datatype backward*/
//XTensor embeddingBig16;
//XTensor hidden16;
//embeddingBig16 = ConvertDataType(embeddingBig, X_FLOAT16);
//hidden16 = Reshape(embeddingBig16, embeddingBig16.order, dimSize);
//hidden = ConvertDataType(hidden16, X_FLOAT);
hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
/* hidden layers */
for (int i = 0; i < depth; i++) {
/*test for MMul float16 backward*/
//XTensor hiddenW16;
//XTensor hidden16;
//XTensor hiddenBefore16;
//XTensor hiddenBefore;
//hiddenW16 = ConvertDataType(model.hiddenW[i], X_FLOAT16);
//hidden16 = ConvertDataType(hidden, X_FLOAT16);
//hiddenBefore16 = MMul(hidden16, hiddenW16);
//hiddenBefore = ConvertDataType(hiddenBefore16, X_FLOAT);
//hidden = HardTanH(hiddenBefore + model.hiddenB[i]);
/*test for HardTanH and Sum float16 backward*/
//XTensor hiddenBefore;
//XTensor hiddenBefore16;
//XTensor hiddenB16;
//XTensor hidden16;
//hiddenBefore = MMul(hidden, model.hiddenW[i]);
//hiddenBefore16 = ConvertDataType(hiddenBefore,X_FLOAT16);
//hiddenB16 = ConvertDataType(model.hiddenB[i], X_FLOAT16);
//hidden16 = HardTanH(hiddenBefore16 + hiddenB16);
//hidden = ConvertDataType(hidden16, X_FLOAT);
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
}
/* output layer */
/*test for MMul float16 backward*/
//XTensor outputW16;
//XTensor hidden16;
//XTensor outputBefore16;
//XTensor outputBefore;
//outputW16 = ConvertDataType(model.outputW, X_FLOAT16);
//hidden16 = ConvertDataType(hidden, X_FLOAT16);
//outputBefore16 = MMul(hidden16, outputW16);
//outputBefore = ConvertDataType(outputBefore16, X_FLOAT);
//output = Softmax(outputBefore + model.outputB, 1);
/*test for and Sum float16 backward*/
//XTensor outputBefore;
//XTensor outputBefore16;
//XTensor outputB16;
//XTensor output16;
//XTensor softmaxBefore16;
//XTensor softmaxBefore;
//outputBefore = MMul(hidden, model.outputW);
//outputBefore16 = ConvertDataType(outputBefore, X_FLOAT16);
//outputB16 = ConvertDataType(model.outputB, X_FLOAT16);
//softmaxBefore16 = outputBefore16 + outputB16;
//softmaxBefore = ConvertDataType(softmaxBefore16, X_FLOAT);
//output = Softmax(softmaxBefore, 1);
/*test for Softmax and Sum float16 backward*/
XTensor softmaxBefore;
XTensor softmaxBefore16;
XTensor output16;
softmaxBefore = MMul(hidden, model.outputW) + model.outputB;
softmaxBefore16 = ConvertDataType(softmaxBefore, X_FLOAT16);
output16 = Softmax(softmaxBefore16, 1);
output = ConvertDataType(output16, X_FLOAT);
//output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
......
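For reference, the change to InitZeroOneTensor2DFp16 follows the same convert-at-the-boundary idea: the one-hot cells are written into an FP32 scratch tensor with Set2D and then converted to FP16. A minimal sketch of that pattern, using only the calls shown in the hunk above (InitTensor2DV2, SetZeroAll, Set2D, _ConvertDataType); the helper name SetZeroOneFp16 is illustrative, and the itemNum/devID parameter names are assumed from the surrounding code since the full signature is truncated in the diff:

/* sketch only: build a 0/1 matrix in FP32, then convert it to FP16 */
void SetZeroOneFp16(XTensor &tensor, int rowNum, int colNum,
                    int * rows, int * cols, int itemNum, int devID)
{
    XTensor tensor32;

    InitTensor2DV2(&tensor,   rowNum, colNum, X_FLOAT16, devID);
    InitTensor2DV2(&tensor32, rowNum, colNum, X_FLOAT,   devID);
    tensor32.SetZeroAll();

    /* mark the non-zero cells in the FP32 copy */
    for (int i = 0; i < itemNum; i++)
        tensor32.Set2D(1.0F, rows[i], cols[i]);

    /* copy the result into the FP16 tensor */
    _ConvertDataType(&tensor32, &tensor);
}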