Commit 3cd237ff by xiaotong

t2t embedding and output layers

parent 287e226c
@@ -33,6 +33,7 @@ T2TEmbedder::T2TEmbedder()
     devID = -1;
     mem = NULL;
     vSize = -1;
+    maxLength = -1;
 }
 
 /* deconstructor */
@@ -52,7 +53,6 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     devID = myDevID;
     mem = myMem;
 
-    int maxLength = 0;
     int d = 0;
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
@@ -102,7 +102,52 @@ make the network
 */
 XTensor * T2TEmbedder::Make(XTensor * input)
 {
-    return NULL;
+    CheckNTErrors(input->GetDim(-1) == vSize, "Wrong vocabulary size!");
+    CheckNTErrors(input->order > 1, "Wrong input tensor size!");
+    CheckNTErrors(input->dimSize[input->order - 2] < maxLength, "The sequence is too long!");
+
+    int dims[MAX_TENSOR_DIM_NUM];
+    memcpy(dims, input->dimSize, input->order * sizeof(int));
+    dims[0] = eSize;
+
+    bool match = (posEmbedding.order == input->order);
+    if(match){
+        for(int i = 0; i < input->order; i++){
+            if(dims[i] != posEmbedding.GetDim(i))
+                match = false;
+        }
+    }
+
+    /* we make positional embeddings first */
+    if(!match){
+        InitTensor(&posEmbedding, input->order, dims, X_FLOAT, 1.0F, devID, mem);
+
+        XTensor * posTMP = NewTensorBuf(2, dims, X_FLOAT, 1.0F, devID, mem);
+
+        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
+
+        int dims2[MAX_TENSOR_DIM_NUM];
+        dims2[0] = dims[0];
+        dims2[1] = dims[1];
+        dims2[2] = posEmbedding.unitNum / (dims[0] * dims[1]);
+        posEmbedding.Reshape(3, dims2);
+
+        _Unsqueeze(posTMP, &posEmbedding, 0, dims2[2]);
+
+        posEmbedding.Reshape(input->order, dims);
+
+        DelTensorBuf(posTMP);
+    }
+
+    XTensor wordEmbedding;
+
+    /* then we make word embeddings */
+    wordEmbedding = MMul(*input, w);
+
+    XTensor * result = new XTensor();
+
+    /* we sum the two embeddings */
+    *result = wordEmbedding + posEmbedding;
+
+    return result;
 }
 }
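As a quick sanity check, the new embedder can be driven roughly as sketched below. Only T2TEmbedder, InitModel and Make are from this commit; the caller, the one-hot batch and the devID/mem arguments are assumptions made for illustration.

    /* illustrative only: feed a one-hot batch through the new embedder */
    void TestEmbedder(int argc, const char ** argv, XTensor * onehot, int devID, XMem * mem)
    {
        /* onehot is assumed to be of shape (batch, length, vSize),
           holding 1.0F at each word id and 0.0F elsewhere */
        T2TEmbedder embedder;
        embedder.InitModel(argc, argv, devID, mem);

        /* word embedding (one-hot x w) plus positional embedding, summed */
        XTensor * embedding = embedder.Make(onehot);

        /* the caller owns the returned tensor */
        delete embedding;
    }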
@@ -48,11 +48,17 @@ public:
     /* embedding size */
     int eSize;
 
+    /* maximum length of the sequence */
+    int maxLength;
+
     /* word embedding matrix */
     XTensor w;
 
     /* predefined positional embeddings. It can speed up
        the embedding processing by re-loading. */
+    XTensor posEmbeddingBase;
 
+    /* positional embeddings */
     XTensor posEmbedding;
 
 public:
...
@@ -26,7 +26,6 @@
 namespace transformer
 {
 
 /* constructor */
 T2TModel::T2TModel()
 {
@@ -61,4 +60,30 @@ void T2TModel::InitModel(int argc, const char ** argv)
     outputLayer.InitModel(argc, argv, devID, mem);
 }
 
+/*
+make the encoding network
+>> input - input tensor
+<< return - encoding result
+*/
+XTensor * T2TModel::MakeEncoding(XTensor * input)
+{
+    return encoder.Make(input);
+}
+
+/*
+make the entire network (with the output softmax layer)
+>> input - input tensor
+>> output - output tensor (distribution)
+*/
+void T2TModel::Make(XTensor * input, XTensor * output)
+{
+    if(isLM){
+        XTensor * encoding = MakeEncoding(input);
+        outputLayer.Make(encoding, output);
+    }
+    else{
+        ShowNTErrors("TODO!");
+    }
+}
 }
\ No newline at end of file
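For the language-model path, the new T2TModel::Make can be exercised roughly as below. T2TModel, InitModel and Make come from this commit; the batch tensor and the surrounding test function are assumptions for illustration only.

    /* illustrative only: run the LM path (encoder -> output layer) */
    void TestModel(int argc, const char ** argv, XTensor * batch)
    {
        T2TModel model;
        model.InitModel(argc, argv);

        /* output receives the log-softmax distribution over the vocabulary;
           if isLM is not set, Make currently stops at ShowNTErrors("TODO!") */
        XTensor output;
        model.Make(batch, &output);
    }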
@@ -64,6 +64,12 @@ public:
     /* initialize the model */
     void InitModel(int argc, const char ** argv);
 
+    /* make the encoding network */
+    XTensor * MakeEncoding(XTensor * input);
+
+    /* make the entire network (with the output softmax layer) */
+    void Make(XTensor * input, XTensor * output);
+
 };
 
 }
...
@@ -74,4 +74,16 @@ XTensor * T2TOutput::Make(XTensor * input)
     return result;
 }
 
+/*
+make the network (redefined output tensor)
+>> input - input tensor
+>> output - output tensor
+*/
+void T2TOutput::Make(XTensor * input, XTensor * output)
+{
+    XTensor &x = *input;
+
+    *output = LogSoftmax(MMul(x, w), -1);
+}
 }
\ No newline at end of file
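The new overload writes the distribution into a caller-owned tensor instead of allocating the result, which is what T2TModel::Make relies on above. A small sketch of how the two overloads would be called (only the two Make calls are from this code; the surrounding function and tensors are assumed):

    /* illustrative only: the new overload keeps its result in a caller-owned tensor */
    void TestOutput(T2TOutput &layer, XTensor * hidden)
    {
        XTensor dist;
        layer.Make(hidden, &dist);            /* new overload: dist = LogSoftmax(hidden * w, -1) */

        XTensor * dist2 = layer.Make(hidden); /* existing overload: returns a newly allocated tensor */
        delete dist2;
    }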
@@ -63,6 +63,9 @@ public:
     /* make the network */
     XTensor * Make(XTensor * input);
 
+    /* make the network (redefined output tensor) */
+    void Make(XTensor * input, XTensor * output);
+
 };
...
@@ -20,6 +20,7 @@
 */
 
 #include "../../XName.h"
+#include "../../XUtility.h"
 #include "CopyValues.h"
 #include "CopyValues.cuh"
@@ -42,7 +43,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
     if ((s->dataType == X_FLOAT16 && t->dataType == X_FLOAT) ||
         (s->dataType == X_FLOAT && t->dataType == X_FLOAT16)) {
         CheckNTErrors(((s->devID < 0 && t->devID < 0) || s->devID == t->devID),
                       "The code must be run on the same device!");
         CheckNTErrors((s->isSparse || t->isSparse), "TODO!");
         ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum);
     }
@@ -69,6 +70,34 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
 }
 /*
+copy a segment of s to t
+>> s - source
+>> sBeg - beginning of the segment
+>> sLen - length of the segment
+>> t - target
+>> tBeg - beginning of the segment on the target side
+>> stream - the stream for creating the job pipeline
+*/
+void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
+{
+    CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
+    CheckNTErrors(s->data != NULL && t->data != NULL, "Cannot copy from an empty data array!");
+    CheckNTErrors(s->unitSize == t->unitSize, "The input tensors must be of the same unit size!");
+    CheckNTErrors(s->order > sBeg && sBeg >= 0 && sLen <= s->unitNum, "Wrong segment on the source side");
+    CheckNTErrors(t->order > tBeg && tBeg >= 0, "Wrong segment on the target side");
+
+    if (!s->isSparse && !t->isSparse) {
+        XMemCopy((char*)t->data + tBeg * t->unitSize, t->devID,
+                 (char*)s->data + sBeg * s->unitSize, s->devID,
+                 s->unitSize * sLen);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+}
+
+/*
 copy s to t (return a XTensor structure)
 make a new tensor to keep the result and return it
...
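The segment copy works on flat element offsets, independent of tensor shape; the embedder above uses it to load the first unitNum positions from posEmbeddingBase. A small usage sketch (the tensor constructors and sizes here are assumptions; only the new _CopyValues overload is from this commit):

    /* illustrative only: copy 16 elements from the start of a
       into b, beginning at flat offset 16 of b */
    void TestCopySegment(int devID, XMem * mem)
    {
        XTensor * a = NewTensor2D(4, 8, X_FLOAT, devID, mem);
        XTensor * b = NewTensor2D(4, 8, X_FLOAT, devID, mem);
        a->SetZeroAll();
        b->SetZeroAll();

        _CopyValues(a, 0, 16, b, 16);

        DelTensor(a);
        DelTensor(b);
    }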
@@ -29,6 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy s to t */
 void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
 
+/* copy a segment of s to t */
+void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
+
 /*
 copy s to t (return a XTensor structure)
 make a new tensor to keep the result and return it
...
@@ -190,6 +190,27 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
     return y;
 }
 
+/*
+log scale softmax y = log(e^x / \sum_{i} e^{x_i})
+keep the result in the output tensor y
+>> x - input vector
+>> y - output vector
+>> leadDim - leading dimension (along which we perform reduction)
+*/
+void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
+{
+    if(!XTensor::IsSameShaped(&x, &y))
+        InitTensor(&y, &x);
+
+    /* call _LogSoftmax function */
+    _LogSoftmax(&x, &y, leadDim);
+
+    /* tensor connection */
+    XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
+    XLink::AddParamToHeadInt(&y, leadDim);
+}
+
 /*
 backward computation for dense matrices with default data type
...
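A small sketch of the new overload follows; the tensor setup is assumed for illustration, only LogSoftmax(x, y, leadDim) itself is from this change.

    /* illustrative only: y is (re)initialized to the shape of x if needed,
       then filled with log(e^x / \sum_i e^{x_i}) along the given dimension */
    void TestLogSoftmax(int devID, XMem * mem)
    {
        XTensor x;
        InitTensor2D(&x, 2, 5, X_FLOAT, devID, mem);
        x.SetZeroAll();          /* log-softmax of all zeros gives log(1/5) per entry */

        XTensor y;
        LogSoftmax(x, y, 1);     /* reduce along dimension 1 (the 5 columns) */
    }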
@@ -33,6 +33,9 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure) */
 XTensor LogSoftmax(const XTensor &x, int leadDim);
 
+/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y) */
+void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
+
 /* de/dx */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx,
...