bug fixes

5f345e87 · xiaotong · e6c92495 · 5f345e87 · 5f345e87 · 5f345e87
Commit 5f345e87 authored Mar 05, 2021 by xiaotong
--- a/source/train/TTrain.cpp
+++ b/source/train/TTrain.cpp
@@ -148,7 +148,6 @@ get a batch of samples
 */
 bool TTDataLoader::GetBatchSimple(XList * inputs, XList * golds)
 {
-    fprintf(stderr, "get batch 0\n");
    CheckNTErrors(file != NULL, "No input file specificed!");
    CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong argument!");
    CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong argument!");
@@ -184,16 +183,14 @@ bool TTDataLoader::GetBatchSimple(XList * inputs, XList * golds)
        InitTensor2D(input, count, 3, X_INT);
        InitTensor2D(gold, count, 1, X_INT);

-        input->SetData(input, count * 3);
-        gold->SetData(gold, count);
+        input->SetData(inputBatch, count * 3);
+        gold->SetData(goldBatch, count);
    }

    delete[] line;
    delete[] inputBatch;
    delete[] goldBatch;

-    fprintf(stderr, "get batch 1\n");
-
    if (count > 0)
        return true;
    else
@@ -225,15 +222,17 @@ void TTModel::Init(XConfig &myConfig, int devID)
 {
    SetConfig(myConfig);

-    int vSize = MAX_INT_IN_TTRAIN + 1;
-    int eSize = config.GetInt("esize", TT_EMBEDDING_SIZE);
-    int hSize = config.GetInt("hsize", TT_HIDDEN_SIZE);
+    vSize = MAX_INT_IN_TTRAIN + 1;
+    eSize = config.GetInt("esize", TT_EMBEDDING_SIZE);
+    hSize = config.GetInt("hsize", TT_HIDDEN_SIZE);

    InitTensor2D(&embeddingW, vSize, eSize, X_FLOAT, devID);
    InitTensor2D(&hiddenW, 3 * eSize, hSize, X_FLOAT, devID);
+    InitTensor2D(&outputW, hSize, vSize, X_FLOAT, devID);

    embeddingW.SetDataRand(-0.1F, 0.1F);
    hiddenW.SetDataRand(-0.1F, 0.1F);
+    outputW.SetDataRand(-0.1F, 0.1F);
 }

 /* create the model */
@@ -243,21 +242,17 @@ void TTModel::Forward(int devID, XTensor * input, XTensor * output)
    XTensor embeddingCat;
    XTensor hidden;

-    fprintf(stderr, "forward 0\n");
-
    /* [e_0, e_1, e_2] = w_e * input(one-hot) */
    embedding = Gather(embeddingW, *input);

    /* e = merge(e_0, e_1, e_2) */
-    embeddingCat = Merge(embedding, 0, 1);
-
-    /* h = e * w_h */
-    hidden = MMul(embeddingCat, hiddenW);
+    embeddingCat = Merge(embedding, embedding.order - 1, embedding.order - 2);

-    /* output = Softmax(h) */
-    *output = Softmax(hidden, 0);
+    /* h = hardtanh(e * w_h) */
+    hidden = HardTanH(MMul(embeddingCat, hiddenW));

-    fprintf(stderr, "forward 1\n");
+    /* output = Softmax(h * w_o) */
+    *output = Softmax(MMul(hidden, outputW), -1);
 }

 /* clear the model */
@@ -292,15 +287,26 @@ bool TTModel::RunSimple(XList * inputs, XList * outputs, XList * golds)
    XTensor * output = (XTensor*)outputs->GetItem(0);
    XTensor * gold = (XTensor*)golds->GetItem(0);
    XTensor loss;
+    XTensor goldOneHot;

    XNet net;

    Forward(devID, input, output);

-    loss = CrossEntropy(output, gold);
+    goldOneHot = IndexToOnehot(*gold, vSize, 0.0F);
+
+    int* dims = new int[goldOneHot.order];
+    for (int i = 0; i < goldOneHot.order - 2; i++)
+        dims[i] = goldOneHot.GetDim(i);
+    dims[goldOneHot.order - 2] = goldOneHot.GetDim(goldOneHot.order - 1);
+    goldOneHot.Reshape(goldOneHot.order - 1, dims);
+
+    loss = CrossEntropy(output, goldOneHot);

    net.Backward(loss);

+    delete[] dims;
+
    return true;
 }


--- a/source/train/TTrain.h
+++ b/source/train/TTrain.h
@@ -111,6 +111,18 @@ protected:
    /* parameter matrix of the hidden layer */
    XTensor hiddenW;

+    /* parameter matrix of the output layer */
+    XTensor outputW;
+
+    /* vocabulary size */
+    int vSize;
+
+    /* embedding size */
+    int eSize;
+
+    /* hidden layer size */
+    int hSize;
+
 public:
    /* constructor */
    TTModel();

--- a/source/train/XLeader.cpp
+++ b/source/train/XLeader.cpp
@@ -206,6 +206,7 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
                  XModel * model, XOptimizer * optimizer)
 {
     bool isDataOK = true;
+     int activeJobCount = 0;

    /* Feed the input to each worker and generate the output.
       For each worker, we define a job queue and enqueue jobs
@@ -216,20 +217,23 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
        XModel * jmodel = worker->GetModel();

        /* get a batch of samples */
-        bool fetched = dataDistributor->GetBatchSimple(worker->GetInput(), worker->GetGold());
-
-        /* job in queue 1: refresh the model */
-        worker->AddJobRefresh(jmodel);
-
-        /* job in queue 1: run the model */
-        worker->AddJobNeuralNet(jmodel, worker->GetInput(), worker->GetOutput(), worker->GetGold());
-
-        /* clear it */
-        worker->Clear();
+        bool fetched = dataDistributor->GetBatchSimple(worker->GetInput(), worker->GetGold()); 

        if (!fetched)
            isDataOK = false;
+        else {
+            /* job in queue 1: refresh the model */
+            worker->AddJobRefresh(jmodel);
+
+            /* job in queue 1: run the model */
+            worker->AddJobNeuralNet(jmodel, worker->GetInput(), worker->GetOutput(), worker->GetGold());
+
+            activeJobCount++;
+        }
    }
+
+    if (activeJobCount == 0)
+        return false;
    
    XList members(jworkers.count);
    for (int i = 0; i < jworkers.count; i++) {
@@ -266,6 +270,11 @@ bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,

    WaitForFinishing();

+    for (int i = 0; i < jworkers.count; i++) {
+        XWorkerJob * worker = (XWorkerJob*)jworkers[i];
+        worker->Clear();
+    }
+
    return isDataOK;
 }


--- a/source/train/XTrainer.cpp
+++ b/source/train/XTrainer.cpp
@@ -123,7 +123,10 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,

            /* one step of update */
            ok = leader.Run(config, dataDistributor, model, optimizer);
-            
+
+            if ((step + 1) % 100 == 0)
+                fprintf(stderr, "epoch:%d step:%d\n", epoch + 1, step + 1);
+
            if (step++ >= nstep)
                break;
        }
@@ -131,10 +134,12 @@ void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
        dataDistributor->End();

        if (step >= nstep)
-            break;
+            break;   
    }

    delete[] ids;
+
+    fprintf(stderr, "epoch:%d step:%d\n", epoch, step);
 }

 } /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/train/XWorkerJob.cpp
+++ b/source/train/XWorkerJob.cpp
@@ -105,7 +105,6 @@ add a new job of model refreshment
 */
 bool XWorkerJob::AddJobRefresh(XModel * myModel)
 {
-    fprintf(stderr, "refresh 0\n");
    CheckNTErrors(myModel != NULL, "no parameter keeper!");

    XList args(1);
@@ -113,8 +112,6 @@ bool XWorkerJob::AddJobRefresh(XModel * myModel)

    queue.EnqueueJob((void*)(char*)XModel::Refresh, &args);

-    fprintf(stderr, "refresh 1\n");
-
    return true;
 }