Commit 3cd237ff by xiaotong

t2t embedding and output layers

parent 287e226c
@@ -33,6 +33,7 @@ T2TEmbedder::T2TEmbedder()
     devID = -1;
     mem = NULL;
     vSize = -1;
+    maxLength = -1;
 }
 
 /* deconstructor */
@@ -52,7 +53,6 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     devID = myDevID;
     mem = myMem;
 
-    int maxLength = 0;
     int d = 0;
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
@@ -102,7 +102,52 @@ make the network
 */
 XTensor * T2TEmbedder::Make(XTensor * input)
 {
-    return NULL;
+    CheckNTErrors(input->GetDim(-1) == vSize, "Wrong vocabulary size!");
+    CheckNTErrors(input->order > 1, "Wrong input tensor size!");
+    CheckNTErrors(input->dimSize[input->order - 2] < maxLength, "The sequence is too long!");
+
+    int dims[MAX_TENSOR_DIM_NUM];
+    memcpy(dims, input->dimSize, input->order * sizeof(int));
+    dims[0] = eSize;
+
+    bool match = (posEmbedding.order == input->order);
+    if(match){
+        for(int i = 0; i < input->order; i++){
+            if(dims[i] != posEmbedding.GetDim(i))
+                match = false;
+        }
+    }
+
+    /* we make positional embeddings first */
+    if(!match){
+        InitTensor(&posEmbedding, input->order, dims, X_FLOAT, 1.0F, devID, mem);
+
+        XTensor * posTMP = NewTensorBuf(2, dims, X_FLOAT, 1.0F, devID, mem);
+
+        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
+
+        int dims2[MAX_TENSOR_DIM_NUM];
+        dims2[0] = dims[0];
+        dims2[1] = dims[1];
+        dims2[2] = posEmbedding.unitNum / (dims[0] * dims[1]);
+        posEmbedding.Reshape(3, dims2);
+
+        _Unsqueeze(posTMP, &posEmbedding, 0, dims2[2]);
+
+        posEmbedding.Reshape(input->order, dims);
+
+        DelTensorBuf(posTMP);
+    }
+
+    XTensor wordEmbedding;
+
+    /* then we make word embeddings */
+    wordEmbedding = MMul(*input, w);
+
+    XTensor * result = new XTensor();
+
+    /* we sum the two embeddings */
+    *result = wordEmbedding + posEmbedding;
+
+    return result;
 }
 }
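As a quick sanity check, the new embedder can be driven roughly as sketched below. Only T2TEmbedder, InitModel and Make are from this commit; the caller, the one-hot batch and the devID/mem arguments are assumptions made for illustration.

    /* illustrative only: feed a one-hot batch through the new embedder */
    void TestEmbedder(int argc, const char ** argv, XTensor * onehot, int devID, XMem * mem)
    {
        /* onehot is assumed to be of shape (batch, length, vSize),
           holding 1.0F at each word id and 0.0F elsewhere */
        T2TEmbedder embedder;
        embedder.InitModel(argc, argv, devID, mem);

        /* word embedding (one-hot x w) plus positional embedding, summed */
        XTensor * embedding = embedder.Make(onehot);

        /* the caller owns the returned tensor */
        delete embedding;
    }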
@@ -48,11 +48,17 @@ public:
     /* embedding size */
     int eSize;
 
+    /* maximum length of the sequence */
+    int maxLength;
+
     /* word embedding matrix */
     XTensor w;
 
     /* predefined positional embeddings. It can speed up
        the embedding processing by re-loading. */
+    XTensor posEmbeddingBase;
 
+    /* positional embeddings */
     XTensor posEmbedding;
 
 public:
...
@@ -26,7 +26,6 @@
 namespace transformer
 {
 
 /* constructor */
 T2TModel::T2TModel()
 {
@@ -61,4 +60,30 @@ void T2TModel::InitModel(int argc, const char ** argv)
     outputLayer.InitModel(argc, argv, devID, mem);
 }
 
+/*
+make the encoding network
+>> input - input tensor
+<< return - encoding result
+*/
+XTensor * T2TModel::MakeEncoding(XTensor * input)
+{
+    return encoder.Make(input);
+}
+
+/*
+make the entire network (with the output softmax layer)
+>> input - input tensor
+>> output - output tensor (distribution)
+*/
+void T2TModel::Make(XTensor * input, XTensor * output)
+{
+    if(isLM){
+        XTensor * encoding = MakeEncoding(input);
+        outputLayer.Make(encoding, output);
+    }
+    else{
+        ShowNTErrors("TODO!");
+    }
+}
 }
\ No newline at end of file
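For the language-model path, the new T2TModel::Make can be exercised roughly as below. T2TModel, InitModel and Make come from this commit; the batch tensor and the surrounding test function are assumptions for illustration only.

    /* illustrative only: run the LM path (encoder -> output layer) */
    void TestModel(int argc, const char ** argv, XTensor * batch)
    {
        T2TModel model;
        model.InitModel(argc, argv);

        /* output receives the log-softmax distribution over the vocabulary;
           if isLM is not set, Make currently stops at ShowNTErrors("TODO!") */
        XTensor output;
        model.Make(batch, &output);
    }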
@@ -64,6 +64,12 @@ public:
     /* initialize the model */
     void InitModel(int argc, const char ** argv);
 
+    /* make the encoding network */
+    XTensor * MakeEncoding(XTensor * input);
+
+    /* make the entire network (with the output softmax layer) */
+    void Make(XTensor * input, XTensor * output);
+
 };
 
 }
...
@@ -74,4 +74,16 @@ XTensor * T2TOutput::Make(XTensor * input)
     return result;
 }
 
+/*
+make the network (redefined output tensor)
+>> input - input tensor
+>> output - output tensor
+*/
+void T2TOutput::Make(XTensor * input, XTensor * output)
+{
+    XTensor &x = *input;
+
+    *output = LogSoftmax(MMul(x, w), -1);
+}
 }
\ No newline at end of file
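The new overload writes the distribution into a caller-owned tensor instead of allocating the result, which is what T2TModel::Make relies on above. A small sketch of how the two overloads would be called (only the two Make calls are from this code; the surrounding function and tensors are assumed):

    /* illustrative only: the new overload keeps its result in a caller-owned tensor */
    void TestOutput(T2TOutput &layer, XTensor * hidden)
    {
        XTensor dist;
        layer.Make(hidden, &dist);            /* new overload: dist = LogSoftmax(hidden * w, -1) */

        XTensor * dist2 = layer.Make(hidden); /* existing overload: returns a newly allocated tensor */
        delete dist2;
    }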
@@ -63,6 +63,9 @@ public:
     /* make the network */
     XTensor * Make(XTensor * input);
 
+    /* make the network (redefined output tensor) */
+    void Make(XTensor * input, XTensor * output);
+
 };
...
@@ -20,6 +20,7 @@
 */
 
 #include "../../XName.h"
+#include "../../XUtility.h"
 #include "CopyValues.h"
 #include "CopyValues.cuh"
@@ -42,7 +43,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
     if ((s->dataType == X_FLOAT16 && t->dataType == X_FLOAT) ||
         (s->dataType == X_FLOAT && t->dataType == X_FLOAT16)) {
         CheckNTErrors(((s->devID < 0 && t->devID < 0) || s->devID == t->devID),
                       "The code must be run on the same device!");
         CheckNTErrors((s->isSparse || t->isSparse), "TODO!");
         ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum);
     }
@@ -69,6 +70,34 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
 }
 /*
+copy a segment of s to t
+>> s - source
+>> sBeg - beginning of the segment
+>> sLen - length of the segment
+>> t - target
+>> tBeg - beginning of the segment on the target side
+>> stream - the stream for creating the job pipeline
+*/
+void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
+{
+    CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
+    CheckNTErrors(s->data != NULL && t->data != NULL, "Cannot copy from an empty data array!");
+    CheckNTErrors(s->unitSize == t->unitSize, "The input tensors must be of the same unit size!");
+    CheckNTErrors(s->order > sBeg && sBeg >= 0 && sLen <= s->unitNum, "Wrong segment on the source side");
+    CheckNTErrors(t->order > tBeg && tBeg >= 0, "Wrong segment on the target side");
+
+    if (!s->isSparse && !t->isSparse) {
+        XMemCopy((char*)t->data + tBeg * t->unitSize, t->devID,
+                 (char*)s->data + sBeg * s->unitSize, s->devID,
+                 s->unitSize * sLen);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+}
+
+/*
 copy s to t (return a XTensor structure)
 make a new tensor to keep the result and return it
...
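The segment copy works on flat element offsets, independent of tensor shape; the embedder above uses it to load the first unitNum positions from posEmbeddingBase. A small usage sketch (the tensor constructors and sizes here are assumptions; only the new _CopyValues overload is from this commit):

    /* illustrative only: copy 16 elements from the start of a
       into b, beginning at flat offset 16 of b */
    void TestCopySegment(int devID, XMem * mem)
    {
        XTensor * a = NewTensor2D(4, 8, X_FLOAT, devID, mem);
        XTensor * b = NewTensor2D(4, 8, X_FLOAT, devID, mem);
        a->SetZeroAll();
        b->SetZeroAll();

        _CopyValues(a, 0, 16, b, 16);

        DelTensor(a);
        DelTensor(b);
    }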
@@ -29,6 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy s to t */
 void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
 
+/* copy a segment of s to t */
+void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
+
 /*
 copy s to t (return a XTensor structure)
 make a new tensor to keep the result and return it
...
@@ -190,6 +190,27 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
     return y;
 }
 
+/*
+log scale softmax y = log(e^x / \sum_{i} e^{x_i})
+keep the result in the output tensor y
+>> x - input vector
+>> y - output vector
+>> leadDim - leading dimension (along which we perform reduction)
+*/
+void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
+{
+    if(!XTensor::IsSameShaped(&x, &y))
+        InitTensor(&y, &x);
+
+    /* call _LogSoftmax function */
+    _LogSoftmax(&x, &y, leadDim);
+
+    /* tensor connection */
+    XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
+    XLink::AddParamToHeadInt(&y, leadDim);
+}
+
 /*
 backward computation for dense matrices with default data type
...
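A small sketch of the new overload follows; the tensor setup is assumed for illustration, only LogSoftmax(x, y, leadDim) itself is from this change.

    /* illustrative only: y is (re)initialized to the shape of x if needed,
       then filled with log(e^x / \sum_i e^{x_i}) along the given dimension */
    void TestLogSoftmax(int devID, XMem * mem)
    {
        XTensor x;
        InitTensor2D(&x, 2, 5, X_FLOAT, devID, mem);
        x.SetZeroAll();          /* log-softmax of all zeros gives log(1/5) per entry */

        XTensor y;
        LogSoftmax(x, y, 1);     /* reduce along dimension 1 (the 5 columns) */
    }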
@@ -33,6 +33,9 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure) */
 XTensor LogSoftmax(const XTensor &x, int leadDim);
 
+/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y) */
+void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
+
 /* de/dx */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                          XTensor * dedy, XTensor * dedx,
...