Use CPU float16 and time the FNN and T2T training loops

1da50ae2 · ltb · 29d2352b · 1da50ae2 · 1da50ae2 · 1da50ae2
Commit 1da50ae2 authored Aug 05, 2019 by ltb
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -416,6 +416,18 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
    double startT = GetClockSec();
+	double mkinput = 0.0;
+	double mkgold = 0.0;
+	double train_time = 0.0;
+	double clearModel = 0.0;
+	double forward=0.0;
+	double backward = 0.0;
+	double update = 0.0;
+	double end = 0.0;
+	double start = 0.0;
+	double time;
    /* iterate for a number of epochs */
    for(epoch = 0; epoch < nEpoch; epoch++){
@@ -426,7 +438,6 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
        wordCount = 0;
        loss = 0;
        ngramNum = 1;
        while(ngramNum > 0){
            /* load a minibatch of ngrams */
@@ -447,13 +458,18 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
            /* the loss tensor */
            XTensor lossTensor;
+			start = GetClockSec();
            /* make the input tensor for position i */
            for(int i = 0; i < model.n - 1; i++)
                MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
+			mkinput += GetClockSec() - start;
+			start = GetClockSec();
            /* make the gold tensor */
            MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
+			mkgold += GetClockSec() - start;
+			time = GetClockSec();
            if(!autoDiff){
                /* prepare an empty network for building the fnn */
                FNNNet net;
@@ -475,28 +491,37 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                loss -= prob;
            }
            else{
+				start = GetClockSec();
                /* gradient = 0 */
                Clear(model, true);
+				clearModel += GetClockSec() - start;
+				start = GetClockSec();
                /* forward + backward process */
                /* this is implemented by gather function */
                ForwardAutoDiff(ngrams, ngramNum, output, model);
+				forward += GetClockSec() - start;
+				start = GetClockSec();
 				/* this is implemented by multiply function */
                lossTensor = CrossEntropy(output, gold);
                /* automatic differentiation */
                autoDiffer.Backward(lossTensor);
+				backward += GetClockSec() - start;
+				start = GetClockSec();
                /* update model parameters */
                Update(model, grad, learningRate, true);
+				update += GetClockSec() - start;
+				start = GetClockSec();
                /* get probabilities */
                float prob = ReduceSumAll(lossTensor);
                loss += prob;
+				end += GetClockSec() - start;
            }
+			train_time += GetClockSec() - time;
            wordCount += ngramNum;
            wordCountTotal += ngramNum;
@@ -507,8 +532,19 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
            if (step % 100 == 0) {				
                double elapsed = GetClockSec() - startT;
+				startT = GetClockSec();
+				XPRINT8(0, stderr, "[Time] mkinput=%.5lfs,mkgold=%.5lfs,train_time=%.5lfs,clearModel=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf, end=%.5lf\n",
+					mkinput, mkgold, train_time, clearModel, forward, backward, update,end);
                XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
                           elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
+				mkinput = 0.0;
+				mkgold = 0.0;
+				train_time = 0.0;
+				clearModel = 0.0;
+				forward = 0.0;
+				backward = 0.0;
+				update = 0.0;
+				end = 0.0;
            }
        }

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -148,6 +148,14 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    double startT = GetClockSec();
+	double mkinput = 0.0;
+	double train_time = 0.0;
+	double forward = 0.0;
+	double backward = 0.0;
+	double update = 0.0;
+	double start = 0.0;
+	double time = 0.0;
    for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
        if(isShuffled)
@@ -177,17 +185,30 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
        /* label smoothed gold standard (if needed) */
        XTensor goldSmoothed;
-        while (batchLoader.LoadBatch(file, model->isLM, 
+        //while (batchLoader.LoadBatch(file, model->isLM, 
+        //                             &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
+        //                             NULL, vSize, vSizeTgt,
+        //                             sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true)) 
+		while (true)
+        {
+			start = GetClockSec();
+			int batch = batchLoader.LoadBatch(file, model->isLM,
 				&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
 				NULL, vSize, vSizeTgt,
-                                     sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true)) 
+				sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true);
-        {
+			mkinput += GetClockSec() - start;
+			if (!batch) {
+				break;
+			}
+			time = GetClockSec();
            CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
            /* output probabilities */
            XTensor output;
+			start = GetClockSec();
            /* make the network */
            if(model->isLM)
                model->MakeLM(batchEnc, output, paddingEnc, true);
@@ -196,11 +217,12 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            else{
                ShowNTErrors("Illegal model type!");
            }
+			forward += GetClockSec() - start;
            /* back-propagation for obtaining gradients */
            //if (labelSmoothingP > 0)
            //    LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
+			start = GetClockSec();
            XTensor labelOnehot;
            labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
@@ -229,7 +251,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                net.Backward(lossTensor);
                //net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
                //net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
+				backward += GetClockSec() - start;
+				start = GetClockSec();
                gradStep += 1;
                loss += prob;
                wordCount += wc;
@@ -248,10 +272,12 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
                    gradStep = 0;
                    validStep++;
+					update += GetClockSec() - start;
                }
            }
            else
                nSkipped++;
+			train_time += GetClockSec() - time;
            if(++step >= nstep){
                isEnd = true;
@@ -260,11 +286,19 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            if (step % 100 == 0) {
                double elapsed = GetClockSec() - startT;
+				startT = GetClockSec();
+				XPRINT6(0, stderr, "[Time] elapsed=%.5lfs,mkinput=%.5lfs,train_time=%.5lfs,forward=%.5lfs, backward=%.5lf, update=%.5lf\n",
+					elapsed, mkinput,train_time, forward, backward, update);
                XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
                        elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
                if (!doUpdate)
                    XPRINT(0, stderr, " (no update)");
                XPRINT(0, stderr, "\n");
+				mkinput = 0.0;
+				train_time = 0.0;
+				forward = 0.0;
+				backward = 0.0;
+				update = 0.0;
            }
            if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -25,6 +25,7 @@
 * $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
 *
 */
+#include "halfLib/half/half.hpp"
 #include <stdio.h>
 #include <stdlib.h>
@@ -50,6 +51,11 @@
 #include "function/Identity.h"
 #include "core/CHeader.h"
+//#include "halfLib/HalfFloat/umHalf.h"
 #ifdef USE_CUDA
 // the CUDA stuff
@@ -376,6 +382,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
            XMemCopy(data, devID, tensor.data, tensor.devID, size);
            if(dataHost != NULL && tensor.dataHost != NULL)
                XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
+                XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
        }
        else{
            DestroyData();
@@ -1854,6 +1861,16 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, 
            }
        }
+		else if (dataType==X_FLOAT16) {
+			int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
+			for (int i = beg; i < end; i++) {
+				halfCPU f = ((halfCPU*)d)[i];
+				if (i == beg)
+					fprintf(file, "%hx", f);
+				else
+					fprintf(file, " %hx", f);
+			}
+		}
        else if (dataType == X_INT) {
            int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
            for(int i = beg; i < end; i++){
@@ -1900,9 +1917,22 @@ dump data to a file
 */
 void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
+	if (tensor->dataType == X_FLOAT16)
+	{
+		/* float16 data is converted into an X_FLOAT tensor and the
+		   converted copy is what gets dumped */
+		XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
+		_ConvertDataType(tensor, &a);
+		a.Dump(file, label, n, beg, verbose);
+	}
+	else
+	{
+		/* every other data type is dumped through a copy of the same
+		   type, exactly as before float16 support was added; raising an
+		   error here would break dumping of tensors that are neither
+		   X_FLOAT nor X_FLOAT16 */
 		XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
 		_CopyValues(tensor, &a);
 		a.Dump(file, label, n, beg, verbose);
+	}
 }
 /* 
@@ -1980,6 +2010,14 @@ void XTensor::Read(FILE * file, const char * label)
                }
            }
        }
+		else if (dataType==X_FLOAT16){
+			for (int i = 0; i < unitNum; i++) {
+				halfCPU * f = ((halfCPU*)data) + i;
+				if (fscanf(file, "%hx", f) < 1) {
+					ShowNTErrors("Incorrect tensor format!");
+				}
+			}
+		}
        else {
            ShowNTErrors("TODO!");
        }
@@ -2006,15 +2044,13 @@ void XTensor::Read(FILE * file, const char * label)
        }
    }
    do {
        c = fgetc(file);
    } while (c != '\n' && c != EOF);
    XMemCopy(dataBackup, devID, data, -1, GetDataSizeInChar());
    data = dataBackup;
+    delete[](char *)dataBuf;
-    delete[](char*)dataBuf;
 }
 /*

--- a/source/tensor/core/utilities/FlushToMem.cu
+++ b/source/tensor/core/utilities/FlushToMem.cu
@@ -97,7 +97,7 @@ void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
 /* copy the data from GPU memory to CPU memory */
 void CudaGPUToCPUFlush(XTensor * tensor)
 {
-    CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
+    //CheckNTErrors((sizeof(DTYPE) == tensor->unitSize), "Unsupported data type.");
    if (tensor->dataHost != NULL)
        delete[](char*)tensor->dataHost;