Commit 484f3694 by xiaotong

bug fixes of transformer

parent ec9a3328
......@@ -57,9 +57,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
......
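For reference, dk, dv and d here all default to the same embedding width, and wk (together with the wq/wv matrices initialized alongside it) feeds the usual scaled dot-product attention,

    Attention(Q, K, V) = softmax(Q K^T / sqrt(dk)) V

where the division by sqrt(dk) keeps the attention logits in a reasonable range. The attention forward pass itself is not part of this hunk, so the formula above is only the standard formulation, given for orientation.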
......@@ -53,12 +53,10 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
devID = myDevID;
mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 256);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
......@@ -135,10 +133,10 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = MMul(input, w);
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
return wordEmbedding +posEmbedding;
}
}
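The Linear(..., sqrt(d)) wrapper above scales the word embedding by sqrt(d) before the positional embedding is added, which is the usual Transformer convention (assuming Linear(x, k) multiplies x by the scalar k):

    embedding(word w at position i) = sqrt(d) * E(w) + PE(i)

where E(w) is the row of w picked out by MMul(input, w) and PE(i) is the positional vector for position i.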
......@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer
{
#define DEFAULT_BEDDING_SIZE 512
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
......@@ -53,6 +53,9 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* word embedding matrix */
XTensor w;
......
......@@ -38,7 +38,8 @@ AttEncoder::~AttEncoder()
{
delete[] attentions;
delete[] fnns;
delete[] layerNorms;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
}
/*
......@@ -54,11 +55,10 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
mem = myMem;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, 512);
LoadParamInt(argc, argv, "esize", &eSize, 512);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
......@@ -67,13 +67,15 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
layerNorms = new T2TLN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
layerNorms[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
......@@ -103,7 +105,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
x = layerNorms[i].Make(res);
x = attLayerNorms[i].Make(res);
/* fnn */
fnn = fnns[i].Make(x);
......@@ -114,7 +116,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
x = layerNorms[i].Make(res);
x = fnnLayerNorms[i].Make(res);
}
return x;
......
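With separate attLayerNorms and fnnLayerNorms, each encoder layer keeps two independent sets of normalization parameters. Assuming res is the residual sum of a sub-layer's input and output (its computation is not shown in these hunks), each layer follows the post-norm pattern:

    x = LN_att(x + Attention(x))
    x = LN_fnn(x + FNN(x))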
......@@ -86,8 +86,11 @@ public:
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization */
T2TLN * layerNorms;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the encoder */
XTensor * input;
......
......@@ -56,9 +56,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
......
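For context, inSize and outSize are the model width d and hSize ("fnnh") is the inner width of the position-wise feed-forward sub-layer, whose standard form is

    FNN(x) = activation(x * W1 + b1) * W2 + b2

with W1 of shape inSize-by-hSize and W2 of shape hSize-by-outSize; the activation (typically a ReLU in Transformer implementations) is not visible in this hunk.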
......@@ -20,6 +20,8 @@
*/
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -48,6 +50,18 @@ void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / (d + d));
w.SetDataRand(-finfout, finfout);
b.SetZeroAll();
}
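The bound computed for SetDataRand is the Xavier/Glorot uniform range with fan_in = fan_out = d:

    finfout = sqrt(6 * scale / (fan_in + fan_out)) = sqrt(6 * scale / (2 * d)) = sqrt(3 * scale / d)

so w is drawn uniformly from [-finfout, +finfout] while the bias b starts at zero.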
/*
......@@ -60,6 +74,7 @@ y =
XTensor T2TLN::Make(XTensor &input)
{
XTensor &x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
......@@ -67,7 +82,7 @@ XTensor T2TLN::Make(XTensor &input)
XTensor standardFilled;
/* \mu = (sum_i x_i)/m */
mean = ReduceSum(x, x.order - 1);
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
......@@ -81,7 +96,10 @@ XTensor T2TLN::Make(XTensor &input)
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
return (x - meanFilled)/standardFilled;
xn = (x - meanFilled)/standardFilled ;
/* result = x' * w + b */
return MMul(xn, w) + b;
}
}
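Putting the pieces of Make together: each length-d row is normalized to zero mean and unit standard deviation along its last dimension and then passed through the learned affine map. Below is a minimal standalone sketch of that computation in plain C++ (illustrative only, not the NiuTensor API). Note that w is a full d-by-d matrix here, as initialized above, rather than the element-wise gain used in many layer-norm implementations, and the diff shows no epsilon term added to the variance.

    #include <cmath>
    #include <vector>

    // Normalize one d-dimensional row, then apply y = xn * w + b,
    // mirroring T2TLN::Make above.
    std::vector<float> LayerNormRow(const std::vector<float> & x,
                                    const std::vector<std::vector<float> > & w,  // d x d transform
                                    const std::vector<float> & b)                // length-d bias
    {
        const size_t d = x.size();

        /* \mu = (sum_i x_i)/d */
        float mean = 0.0F;
        for (float v : x) mean += v;
        mean /= d;

        /* \sigma^2 = (sum_i (x_i - \mu)^2)/d */
        float var = 0.0F;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= d;
        const float sd = std::sqrt(var);   // no epsilon term appears in the diff

        /* x' = (x - \mu)/\sigma */
        std::vector<float> xn(d);
        for (size_t i = 0; i < d; i++)
            xn[i] = (x[i] - mean) / sd;

        /* result = x' * w + b */
        std::vector<float> y(d, 0.0F);
        for (size_t j = 0; j < d; j++) {
            for (size_t i = 0; i < d; i++)
                y[j] += xn[i] * w[i][j];
            y[j] += b[j];
        }
        return y;
    }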
......@@ -29,6 +29,8 @@ using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
......@@ -38,6 +40,12 @@ public:
/* memory pool */
XMem * mem;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
public:
/* constructor */
T2TLN();
......
......@@ -57,8 +57,8 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
......
......@@ -113,7 +113,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
/* learning rate */
lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
lr = 0.000002F;
//lr = 0.00005F;
/* update the parameters */
Update(model, lr);
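The expression on the first lr line is the standard Transformer warm-up schedule: the rate rises roughly linearly for the first nwarmup steps, then decays as the inverse square root of the step count, all scaled by 1/sqrt(d). A standalone sketch of the same rule (illustrative only):

    #include <algorithm>
    #include <cmath>

    // lr = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5)
    float WarmupLR(int step, int d, int nwarmup)
    {
        float scale = 1.0F / std::sqrt((float)d);
        float decay = std::pow((float)(step + 1), -0.5F);
        float rise  = (float)(step + 1) * std::pow((float)nwarmup, -1.5F);
        return scale * std::min(decay, rise);
    }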
......@@ -138,6 +138,9 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
}
fclose(file);
if (isEnd)
break;
}
double elapsed = GetClockSec() - startT;
......@@ -345,6 +348,10 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
ws.Add(&model->encoder.attentions[i].wk);
ws.Add(&model->encoder.attentions[i].wq);
ws.Add(&model->encoder.attentions[i].wv);
ws.Add(&model->encoder.fnnLayerNorms[i].w);
ws.Add(&model->encoder.fnnLayerNorms[i].b);
ws.Add(&model->encoder.attLayerNorms[i].w);
ws.Add(&model->encoder.attLayerNorms[i].b);
}
ws.Add(&model->encoder.embedder.w);
......@@ -353,11 +360,37 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
XTensor * para = (XTensor*)ws.Get(i);
XTensor * paraGrad = para->grad;
if (para == NULL || paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
/*
DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
for (int i = 0; i < para->unitNum; i++) {
if (IsNAN(d[i]) || IsINF(d[i])) {
int nnn = 0;
}
if (IsNAN(g[i]) || IsINF(g[i])) {
int nnn = 0;
}
}
delete[] d;
delete[] g;
*/
/* the delta rule */
_Sum(para, paraGrad, para, -lr);
/* clear gradient */
paraGrad->SetZeroAll();
}
}
......
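The "delta rule" applied by _Sum above is plain stochastic gradient descent on every parameter tensor collected in ws (assuming the last argument of _Sum scales its second operand):

    para <- para - lr * grad(para)

after which the gradient is zeroed so the next batch accumulates from scratch.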
......@@ -26,6 +26,8 @@
namespace transformer
{
FILE * tmpFILE;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
......
......@@ -27,6 +27,8 @@
namespace transformer
{
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
......
......@@ -33,6 +33,8 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0)
return 1;
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH];
......@@ -51,6 +53,8 @@ int TransformerMain(int argc, const char ** argv)
delete[] trainFN;
fclose(tmpFILE);
return 0;
}
......