Commit dd0ad421 by xiaotong

bug fixes of attention and layer normalization

parent fe37006f
@@ -74,14 +74,17 @@ void T2TAttention::InitModel(int argc, char ** argv,
     InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
     InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
     InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
+    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
 
     float scale = 1.0F;
     float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
     float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
+    float finfouta = (float)sqrt(6.0F * scale / (d + d));
 
     wk.SetDataRand(-finfoutk, finfoutk);
     wq.SetDataRand(-finfoutk, finfoutk);
     wv.SetDataRand(-finfoutv, finfoutv);
+    wa.SetDataRand(-finfouta, finfouta);
 }
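The three `+` lines initialize the new output-projection matrix `wa` with the same fan-based uniform scheme already used for `wk`, `wq`, and `wv`: weights are drawn from U(-b, b) with b = sqrt(6 * scale / (fan_in + fan_out)), i.e. Xavier/Glorot-style uniform initialization. A minimal stand-alone sketch of that scheme (the helper name `xavierUniform` is illustrative, not part of this codebase):

    #include <cmath>
    #include <random>
    #include <vector>

    /* Xavier/Glorot-style uniform initialization: draw each weight from
       U(-b, b) with b = sqrt(6 / (fanIn + fanOut)). For the square d-by-d
       matrix wa, fanIn == fanOut == d, so b reduces to sqrt(3.0 / d). */
    std::vector<float> xavierUniform(int fanIn, int fanOut, std::mt19937 &rng)
    {
        float bound = std::sqrt(6.0f / (float)(fanIn + fanOut));
        std::uniform_real_distribution<float> dist(-bound, bound);
        std::vector<float> w((size_t)fanIn * (size_t)fanOut);
        for (float &v : w)
            v = dist(rng);
        return w;
    }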
@@ -135,7 +138,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
     att = BMMul(scalar, vheads);
 
     /* concatenate the heads */
-    return Merge(att, att.order - 1);
+    return MMul(Merge(att, att.order - 1), wa);
 }
 
 }
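This hunk is the substance of the attention fix: the original code returned the concatenated heads directly, dropping the output projection (the W^O of standard multi-head attention) that should follow `Merge`. With the fix, the merged result of width d is multiplied by the d-by-d matrix `wa`, which lets information mix across heads. A hedged sketch of what that final step computes on a single row-major [len x d] matrix (all names here are illustrative):

    #include <vector>

    /* Apply the output projection: out = merged * wa, where merged holds
       the concatenated heads ([len x d], row-major) and wa is [d x d]. */
    std::vector<float> projectHeads(const std::vector<float> &merged,
                                    const std::vector<float> &wa,
                                    int len, int d)
    {
        std::vector<float> out((size_t)len * d, 0.0f);
        for (int i = 0; i < len; i++)
            for (int k = 0; k < d; k++) {
                float a = merged[(size_t)i * d + k];
                for (int j = 0; j < d; j++)
                    out[(size_t)i * d + j] += a * wa[(size_t)k * d + j];
            }
        return out;
    }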
@@ -57,6 +57,9 @@ public:
     /* transformation matrix for V */
     XTensor wv;
 
+    /* transformation after dot-product attention */
+    XTensor wa;
+
     /* size of transformed Q and K */
     int dk;
...
@@ -32,7 +32,8 @@ namespace transformer
 T2TLN::T2TLN()
 {
     devID = -1;
     mem = NULL;
+    d = 0;
 }
 
 /* de-constructor */
@@ -52,23 +53,23 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     devID = myDevID;
     mem = myMem;
-    int d = 0;
+    d = 0;
 
     LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
 
-    InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
+    InitTensor1D(&w, d, X_FLOAT, devID, mem);
     InitTensor1D(&b, d, X_FLOAT, devID, mem);
 
     float scale = 1.0F;
-    float finfout = (float)sqrt(6.0F * scale / (d + d));
+    float finfout = (float)sqrt(6.0F * scale / d);
 
     w.SetDataRand(-finfout, finfout);
     b.SetZeroAll();
 }
 
 /*
 make the network
 for each layer representation x, we have
 y = w * (x - \mu)/standard + b
 >> input - the input tensor
 >> return - layer normalization output
 */
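Two things change in `T2TLN::InitModel`: the local `int d` that shadowed the new member variable is removed, and `w` becomes a 1D vector of size d instead of a d-by-d matrix, so the initialization bound changes from sqrt(6 * scale / (d + d)) to sqrt(6 * scale / d). For illustration, assuming an embedding size of d = 512, the old bound would be sqrt(6/1024) ≈ 0.077, while the new one is sqrt(6/512) ≈ 0.108.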
@@ -90,17 +91,17 @@ XTensor T2TLN::Make(XTensor &input)
     /* standard = sqrt(variance) */
     standard = Power(variance, 0.5F);
 
     /* unsqueeze mean and standard deviation to fit them into
        the same shape of x */
     meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
     standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
 
     /* x' = (x - \mu)/standard */
-    xn = (x - meanFilled)/standardFilled;
+    xn = (x - meanFilled) / standardFilled;
 
     /* result = x' * w + b */
-    return MMul(xn, w) + b;
+    return xn * w + b;
 }
 
 }
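This is the layer-normalization fix itself: `w` is now a per-dimension gain vector, so the affine step is the elementwise `xn * w + b` rather than the matrix product `MMul(xn, w)` that the buggy code computed. A self-contained sketch of the whole computation for one d-dimensional row, mirroring the fixed code (which takes sqrt(variance) directly, with no epsilon term); the function name is illustrative:

    #include <cmath>

    /* Layer normalization of one d-dimensional row:
       x' = (x - mean) / sqrt(var), y = x' * w + b,
       with the gain w and bias b applied elementwise. */
    void layerNormRow(const float *x, const float *w, const float *b,
                      float *y, int d)
    {
        float mean = 0.0f;
        for (int i = 0; i < d; i++)
            mean += x[i];
        mean /= (float)d;

        float var = 0.0f;
        for (int i = 0; i < d; i++)
            var += (x[i] - mean) * (x[i] - mean);
        var /= (float)d;

        float standard = std::sqrt(var);
        for (int i = 0; i < d; i++)
            y[i] = (x[i] - mean) / standard * w[i] + b[i];
    }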
@@ -45,6 +45,9 @@ public:
     /* the bias term b */
     XTensor b;
 
+    /* dimension size of the model */
+    int d;
+
 public:
 
     /* constructor */
...
@@ -174,6 +174,7 @@ void T2TModel::GetParams(XList &list)
     list.Add(&encoder.attentions[i].wk);
     list.Add(&encoder.attentions[i].wq);
     list.Add(&encoder.attentions[i].wv);
+    list.Add(&encoder.attentions[i].wa);
     list.Add(&encoder.fnnLayerNorms[i].w);
     list.Add(&encoder.fnnLayerNorms[i].b);
     list.Add(&encoder.attLayerNorms[i].w);
...
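The last hunk registers the new `wa` tensor in `GetParams`. This one-liner is easy to miss but essential: the list collected here is what the rest of the training code sees, so a parameter left out of it would presumably never receive gradient updates and would keep its random initialization forever. A minimal sketch of the pattern (the `Tensor`/`Attention` stand-ins are illustrative, not the XTensor API):

    #include <vector>

    struct Tensor { /* weights, gradients, ... */ };
    struct Attention { Tensor wk, wq, wv, wa; };

    /* Collect every trainable tensor; anything missing from the
       list is invisible to the optimizer. */
    void getParams(Attention &att, std::vector<Tensor*> &list)
    {
        list.push_back(&att.wk);
        list.push_back(&att.wq);
        list.push_back(&att.wv);
        list.push_back(&att.wa); /* the fix: without this, wa is never trained */
    }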