Commit 484f3694 by xiaotong

bug fixes of transformer

parent ec9a3328
@@ -57,9 +57,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
     float minmax = 0;
     LoadParamInt(argc, argv, "nhead", &nhead, 8);
-    LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
     LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
     InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
...
@@ -53,12 +53,10 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
     devID = myDevID;
     mem = myMem;
-    int d = 0;
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamInt(argc, argv, "maxlen", &maxLength, 256);
-    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_BEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
+    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
@@ -135,10 +133,10 @@ XTensor T2TEmbedder::Make(XTensor &input)
     XTensor wordEmbedding;
     /* then we make word embeddings */
-    wordEmbedding = MMul(input, w);
+    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
     /* we sum over the two embeddings */
     return wordEmbedding + posEmbedding;
 }
 }
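Side note (explanatory, not part of the commit): the new Make() scales the word embeddings by sqrt(d) before the positional embeddings are added, which is the usual Transformer convention. With W the embedding matrix and PE the positional embedding, the output is roughly

    e = \sqrt{d}\,(input \cdot W) + PE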
@@ -29,7 +29,7 @@ using namespace nts;
 namespace transformer
 {
-#define DEFAULT_BEDDING_SIZE 512
+#define DEFAULT_EMBEDDING_SIZE 512
 /*
 embedding (of word at position i):
@@ -53,6 +53,9 @@ public:
     /* maximum length of the sequence */
     int maxLength;
+    /* dimension size of the hidden layers in the t2t model */
+    int d;
     /* word embedding matrix */
     XTensor w;
...
@@ -38,7 +38,8 @@ AttEncoder::~AttEncoder()
 {
     delete[] attentions;
     delete[] fnns;
-    delete[] layerNorms;
+    delete[] attLayerNorms;
+    delete[] fnnLayerNorms;
 }
 /*
@@ -54,11 +55,10 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
     mem = myMem;
     LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
-    LoadParamInt(argc, argv, "hsize", &hSize, 512);
-    LoadParamInt(argc, argv, "esize", &eSize, 512);
+    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
     CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
     CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
@@ -67,13 +67,15 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
     attentions = new T2TAttention[nlayer];
     fnns = new T2TFNN[nlayer];
-    layerNorms = new T2TLN[nlayer];
+    attLayerNorms = new T2TLN[nlayer];
+    fnnLayerNorms = new T2TLN[nlayer];
     /* initialize the stacked layers */
     for(int i = 0; i < nlayer; i++){
         attentions[i].InitModel(argc, argv, myDevID, myMem);
         fnns[i].InitModel(argc, argv, myDevID, myMem);
-        layerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
     }
 }
@@ -103,7 +105,7 @@ XTensor AttEncoder::Make(XTensor &input)
         /* TODO: dropout */
         /* layer normalization */
-        x = layerNorms[i].Make(res);
+        x = attLayerNorms[i].Make(res);
         /* fnn */
         fnn = fnns[i].Make(x);
@@ -114,7 +116,7 @@ XTensor AttEncoder::Make(XTensor &input)
         /* TODO: dropout */
         /* layer normalization */
-        x = layerNorms[i].Make(res);
+        x = fnnLayerNorms[i].Make(res);
     }
     return x;
...
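To see why two separate arrays of layer-norm modules are now needed, here is a rough sketch of one encoder layer in AttEncoder::Make() (illustrative only, not code from this commit; the exact attention call and the residual sums are assumptions, since those lines are collapsed in this diff):

    /* sketch of one post-norm encoder layer (illustrative) */
    att = attentions[i].Make(x);        /* self-attention block (signature assumed)  */
    res = att + x;                      /* residual connection (assumed)             */
    x   = attLayerNorms[i].Make(res);   /* layer norm after the attention sub-layer  */
    fnn = fnns[i].Make(x);              /* position-wise feed-forward network        */
    res = fnn + x;                      /* residual connection (assumed)             */
    x   = fnnLayerNorms[i].Make(res);   /* layer norm after the fnn sub-layer        */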
@@ -86,8 +86,11 @@ public:
     /* attention model of each layer */
     T2TAttention * attentions;
-    /* layer normalization */
-    T2TLN * layerNorms;
+    /* layer normalization for fnn */
+    T2TLN * fnnLayerNorms;
+    /* layer normalization for attention */
+    T2TLN * attLayerNorms;
     /* input tensor of the encoder */
     XTensor * input;
...
@@ -56,9 +56,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     float minmax = 0;
-    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE);
-    LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
     InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
...
@@ -20,6 +20,8 @@
  */
 #include "T2TLayerNormal.h"
+#include "T2TUtility.h"
+#include "T2TEmbedding.h"
 #include "../../tensor/core/CHeader.h"
 namespace transformer
@@ -48,6 +50,18 @@ void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
 {
     devID = myDevID;
     mem = myMem;
+    int d = 0;
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
+    InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
+    InitTensor1D(&b, d, X_FLOAT, devID, mem);
+    float scale = 1.0F;
+    float finfout = (float)sqrt(6.0F * scale / (d + d));
+    w.SetDataRand(-finfout, finfout);
+    b.SetZeroAll();
 }
 /*
@@ -60,6 +74,7 @@ y =
 XTensor T2TLN::Make(XTensor &input)
 {
     XTensor &x = input;
+    XTensor xn;
     XTensor mean;
     XTensor variance;
     XTensor standard;
@@ -67,7 +82,7 @@ XTensor T2TLN::Make(XTensor &input)
     XTensor standardFilled;
     /* \mu = (sum_i x_i)/m */
-    mean = ReduceSum(x, x.order - 1);
+    mean = ReduceMean(x, x.order - 1);
     /* \sigma = (sum_i (x_i - \mu)^2)/m */
     variance = ReduceVariance(x, x.order - 1, mean);
@@ -81,7 +96,10 @@ XTensor T2TLN::Make(XTensor &input)
     standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
     /* x' = (x - \mu)/standard */
-    return (x - meanFilled)/standardFilled;
+    xn = (x - meanFilled)/standardFilled;
+    /* result = x' * w + b */
+    return MMul(xn, w) + b;
 }
 }
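Two remarks on the hunks above (explanatory only, not part of the commit): replacing ReduceSum with ReduceMean makes the mean an actual average over the last dimension, and the layer now learns an affine transform on top of the normalization. Writing m for the size of the normalized dimension,

    y = \frac{x - \mu}{\sigma} W + b, \qquad \mu = \frac{1}{m}\sum_i x_i, \qquad \sigma = \sqrt{\frac{1}{m}\sum_i (x_i - \mu)^2}

The initialization bound finfout = sqrt(6 / (d + d)) is the standard Xavier/Glorot uniform range for the d-by-d matrix w.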
@@ -29,6 +29,8 @@ using namespace nts;
 namespace transformer
 {
+/* layer normalization: y = norm(x) * w + b
+   where norm(x) = (x - mean)/standardDeviation */
 class T2TLN
 {
 public:
@@ -38,6 +40,12 @@ public:
     /* memory pool */
     XMem * mem;
+    /* the transformation matrix w */
+    XTensor w;
+    /* the bias term b */
+    XTensor b;
 public:
     /* constructor */
     T2TLN();
...
@@ -57,8 +57,8 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
     float minmax = 0;
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_BEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
     InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
...
@@ -113,7 +113,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
         /* learning rate */
         lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
-        lr = 0.000002F;
+        //lr = 0.00005F;
         /* update the parameters */
         Update(model, lr);
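For reference (explanatory note, not part of the commit): with the hard-coded learning rate commented out, training now follows the warm-up schedule computed on the line above,

    lr = \frac{1}{\sqrt{d}} \cdot \min\bigl((step + 1)^{-0.5},\ (step + 1) \cdot nwarmup^{-1.5}\bigr)

so the rate grows linearly for the first nwarmup steps and then decays with the inverse square root of the step count.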
@@ -138,6 +138,9 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
         }
         fclose(file);
+        if (isEnd)
+            break;
     }
     double elapsed = GetClockSec() - startT;
@@ -345,6 +348,10 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
         ws.Add(&model->encoder.attentions[i].wk);
         ws.Add(&model->encoder.attentions[i].wq);
         ws.Add(&model->encoder.attentions[i].wv);
+        ws.Add(&model->encoder.fnnLayerNorms[i].w);
+        ws.Add(&model->encoder.fnnLayerNorms[i].b);
+        ws.Add(&model->encoder.attLayerNorms[i].w);
+        ws.Add(&model->encoder.attLayerNorms[i].b);
     }
     ws.Add(&model->encoder.embedder.w);
@@ -353,11 +360,37 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
         XTensor * para = (XTensor*)ws.Get(i);
         XTensor * paraGrad = para->grad;
+        if (para == NULL || paraGrad == NULL)
+            continue;
         CheckNTErrors(para != NULL, "NULL parameter tensor!");
         CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
+        /*
+        DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
+        DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
+        XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
+        XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
+        for (int i = 0; i < para->unitNum; i++) {
+            if (IsNAN(d[i]) || IsINF(d[i])) {
+                int nnn = 0;
+            }
+            if (IsNAN(g[i]) || IsINF(g[i])) {
+                int nnn = 0;
+            }
+        }
+        delete[] d;
+        delete[] g;
+        */
         /* the delta rule */
         _Sum(para, paraGrad, para, -lr);
+        /* clear gradient */
+        paraGrad->SetZeroAll();
     }
 }
...
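One more note on the update loop above (explanatory only, not part of the commit): every parameter collected in ws, now including the layer-norm weights w and biases b, is updated by plain SGD via _Sum(para, paraGrad, para, -lr), i.e.

    \theta \leftarrow \theta - lr \cdot \nabla_\theta

and paraGrad->SetZeroAll() then clears the gradient buffer so that gradients from one step do not accumulate into the next.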
@@ -26,6 +26,8 @@
 namespace transformer
 {
+FILE * tmpFILE;
 void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
 {
     char vname[128];
...
@@ -27,6 +27,8 @@
 namespace transformer
 {
+extern FILE * tmpFILE;
 /* load arguments */
 void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
 void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
...
@@ -33,6 +33,8 @@ int TransformerMain(int argc, const char ** argv)
     if(argc == 0)
         return 1;
+    tmpFILE = fopen("tmp.txt", "wb");
     ShowParams(argc, argv);
     char * trainFN = new char[MAX_LINE_LENGTH];
@@ -51,6 +53,8 @@ int TransformerMain(int argc, const char ** argv)
     delete[] trainFN;
+    fclose(tmpFILE);
     return 0;
 }
...