Commit b405b50e by xiaotong

bug fixes

parent 2a7e0de5
......@@ -381,7 +381,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node)
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
......
......@@ -60,7 +60,7 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.08F);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
......
......@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
w.SetDataRandn(0, sqrt((float)eSize));
w.SetDataRandn(0, 1/sqrt((float)eSize));
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
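The initialization above switches the standard deviation of the embedding weights from sqrt(eSize) to 1/sqrt(eSize), so the initial embeddings shrink rather than grow with the embedding size. A minimal plain-C++ sketch of the same draw, assuming SetDataRandn(mean, stddev) semantics and using std::vector instead of XTensor (InitEmbedding is a hypothetical helper):

```cpp
#include <cmath>
#include <random>
#include <vector>

// Fill a vSize x eSize embedding matrix with values drawn from
// N(0, 1/sqrt(eSize)), mirroring w.SetDataRandn(0, 1/sqrt((float)eSize)).
std::vector<float> InitEmbedding(int vSize, int eSize, unsigned seed = 1)
{
    std::mt19937 gen(seed);
    std::normal_distribution<float> dist(0.0f, 1.0f / std::sqrt((float)eSize));
    std::vector<float> w((size_t)vSize * eSize);
    for (float &v : w)
        v = dist(gen);
    return w;
}
```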
......@@ -135,7 +135,7 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = MMul(&input, w);
wordEmbedding = MMul(input, w);
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
......
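The call now passes the tensor by reference instead of by pointer (MMul(input, w) rather than MMul(&input, w)). Assuming each input row is a one-hot word vector, the product with the embedding table is simply a row lookup, and the positional embedding is then added on top, as in "return wordEmbedding + posEmbedding" above. A small sketch with plain arrays (EmbedToken and its arguments are hypothetical):

```cpp
#include <vector>

// Multiplying a one-hot row vector by the (vSize x eSize) embedding matrix
// selects one row of it; adding the positional embedding of that position
// gives the final input representation.
std::vector<float> EmbedToken(const std::vector<float> &w,        // vSize * eSize, row-major
                              int eSize,
                              int wordId,                         // index of the 1 in the one-hot vector
                              const std::vector<float> &posEmb)   // eSize values for this position
{
    std::vector<float> out(eSize);
    for (int i = 0; i < eSize; i++)
        out[i] = w[(size_t)wordId * eSize + i] + posEmb[i];
    return out;
}
```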
......@@ -103,10 +103,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
ln = layerNorms[i].Make(res);
/* input of next layer */
x = ln;
x = layerNorms[i].Make(res);
/* fnn */
fnn = fnns[i].Make(x);
......@@ -117,10 +114,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
ln = layerNorms[i].Make(res);
/* input of next layer */
x = ln;
x = layerNorms[i].Make(res);
}
return x;
......
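In both hunks, ln = layerNorms[i].Make(res); followed by x = ln; collapses into a single x = layerNorms[i].Make(res);: the layer normalization over the residual sum is used directly as the input of the next sub-layer. A minimal sketch of that post-norm residual step on plain vectors, assuming res is the sum of the sub-layer input and output as the surrounding code suggests (the learnable scale and bias of the real T2TLN are omitted):

```cpp
#include <cmath>
#include <vector>

// y = LayerNorm(x + Sublayer(x)): the "residual then normalize" step applied
// after each encoder sub-layer (attention or FNN) in Make() above.
std::vector<float> LayerNorm(const std::vector<float> &v, float eps = 1e-6f)
{
    float mean = 0, var = 0;
    for (float x : v) mean += x;
    mean /= v.size();
    for (float x : v) var += (x - mean) * (x - mean);
    var /= v.size();
    std::vector<float> out(v.size());
    for (size_t i = 0; i < v.size(); i++)
        out[i] = (v[i] - mean) / std::sqrt(var + eps);
    return out;
}

std::vector<float> ResidualThenNorm(const std::vector<float> &x,
                                    const std::vector<float> &sublayerOut)
{
    std::vector<float> res(x.size());
    for (size_t i = 0; i < x.size(); i++)
        res[i] = x[i] + sublayerOut[i];   // res = x + Sublayer(x)
    return LayerNorm(res);                // x (input of next sub-layer) = LN(res)
}
```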
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -58,7 +59,7 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.08F);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
......@@ -66,10 +67,13 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
w1.SetDataRand(-minmax, minmax);
b1.SetDataRand(-minmax, minmax);
w2.SetDataRand(-minmax, minmax);
b2.SetDataRand(-minmax, minmax);
float finfout1 = sqrt(6/(inSize + hSize));
float finfout2 = sqrt(6/(hSize + outSize));
w1.SetDataRand(-finfout1, finfout1);
b1.SetZeroAll();
w2.SetDataRand(-finfout2, finfout2);
b2.SetZeroAll();
}
/*
......
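The new code replaces the fixed ±minmax range with the Xavier/Glorot uniform bound sqrt(6/(fan_in + fan_out)) for the weights and zero-initializes the biases; the same pattern reappears in T2TOutput below. One detail worth noting is that 6/(inSize + hSize) is integer division in C++, so the sketch below uses a floating-point literal. XavierUniform is a hypothetical helper on plain vectors, not the XTensor API:

```cpp
#include <cmath>
#include <random>
#include <vector>

// Xavier/Glorot uniform initialization: U(-b, b) with b = sqrt(6 / (fanIn + fanOut)).
// Note the 6.0f literal: with integer operands, 6 / (fanIn + fanOut) truncates to 0.
std::vector<float> XavierUniform(int fanIn, int fanOut, unsigned seed = 1)
{
    float bound = std::sqrt(6.0f / (float)(fanIn + fanOut));
    std::mt19937 gen(seed);
    std::uniform_real_distribution<float> dist(-bound, bound);
    std::vector<float> w((size_t)fanIn * fanOut);
    for (float &v : w)
        v = dist(gen);
    return w;
}
// w1: fanIn = inSize, fanOut = hSize; w2: fanIn = hSize, fanOut = outSize;
// b1 and b2 start at zero.
```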
......@@ -22,6 +22,7 @@
#include "T2TModel.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
......@@ -81,9 +82,9 @@ make the entire network (with the output softmax layer)
*/
void T2TModel::Make(XTensor &input, XTensor &output)
{
if(isLM){
XTensor encoding;
if(isLM){
encoding = MakeEncoding(input);
outputLayer.Make(encoding, output);
}
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -61,7 +62,9 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
w.SetDataRand(-minmax, minmax);
float finfout = sqrt(6/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
}
/*
......
......@@ -59,6 +59,8 @@ void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
LoadParamInt(argc, argv, "nstep", &nstep, 1);
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
......@@ -82,6 +84,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
int wordCountTotal = 0;
bool isEnd = false;
float loss = 0;
float lr = 0;
XNet net;
......@@ -108,8 +111,12 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
/* back-propagation for obtaining gradients */
net.Backward(output, batch, CROSSENTROPY);
/* learning rate */
lr = (1/sqrt((float)d)) * MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
lr = 0.000005F;
/* update the parameters */
Update(model);
Update(model, lr);
/* get probabilities */
float prob = GetProb(&output, &batch, NULL);
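The schedule on the lr = ... line is the Transformer warm-up rule lr = d^{-0.5} * min(stepNum^{-0.5}, stepNum * nwarmup^{-1.5}) with stepNum = step + 1, although the following lr = 0.000005F; then overrides it with a small constant. A standalone sketch of the schedule itself, using the defaults d = 512 and nwarmup = 4000 from T2TTrainer::Init (WarmupLR is a hypothetical helper):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Transformer warm-up schedule: lr rises linearly for the first nwarmup steps,
// then decays with 1/sqrt(step), scaled overall by 1/sqrt(d).
float WarmupLR(int step, int d = 512, int nwarmup = 4000)
{
    float s = (float)(step + 1);
    return (1.0f / std::sqrt((float)d)) *
           std::min(std::pow(s, -0.5f), s * std::pow((float)nwarmup, -1.5f));
}

int main()
{
    for (int step : {0, 100, 1000, 4000, 20000})
        std::printf("step=%d lr=%e\n", step, WarmupLR(step));
    return 0;
}
```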
......@@ -125,8 +132,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
}
}
......@@ -135,8 +142,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
}
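Both XPRINT lines report perplexity as exp(loss / wordCount), the exponential of the average per-word cross-entropy in nats; for example, a summed loss of 4600 over 1000 words gives ppl = exp(4.6) ≈ 99.5.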
......@@ -318,9 +325,13 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
/*
update the model by delta rule
\theta_new = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
>> model - the t2t model
>> lr - learning rate
*/
void T2TTrainer::Update(T2TModel * model)
void T2TTrainer::Update(T2TModel * model, const float lr)
{
XList ws(100);
......@@ -342,8 +353,18 @@ void T2TTrainer::Update(T2TModel * model)
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
/*DTYPE * d = (DTYPE*)paraGrad->data;
for(int i = 0; i < paraGrad->unitNum; i++){
if(IsINF(d[i])){
fprintf(stderr, "isinf %d\n", i);
}
if(IsNAN(d[i])){
fprintf(stderr, "isnan %d\n", i);
}
}*/
/* the delta rule */
_Sum(para, paraGrad, para, -lrate);
_Sum(para, paraGrad, para, -lr);
}
}
......
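The update itself is plain stochastic gradient descent: _Sum(para, paraGrad, para, -lr) computes para = para + (-lr) * paraGrad in place. A minimal plain-C++ sketch of the same delta rule, with an inf/NaN guard like the one left commented out above (SGDUpdate and its arguments are hypothetical):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Delta rule: theta_new = theta - lr * grad, applied element-wise,
// which is what _Sum(para, paraGrad, para, -lr) does on an XTensor.
void SGDUpdate(std::vector<float> &para, const std::vector<float> &grad, float lr)
{
    for (size_t i = 0; i < para.size(); i++) {
        if (std::isinf(grad[i]) || std::isnan(grad[i]))        // guard similar to the
            std::fprintf(stderr, "bad gradient at %zu\n", i);  // commented-out check
        para[i] -= lr * grad[i];
    }
}
```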
......@@ -64,6 +64,12 @@ public:
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* dimension size of each inner layer */
int d;
/* step number of warm-up for training */
int nwarmup;
/* vocabulary size of the source side */
int vSize;
......@@ -105,7 +111,7 @@ public:
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model);
void Update(T2TModel * model, const float lr);
};
......
......@@ -1046,7 +1046,7 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 1 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[3] = {d0, d1, d1};
int dims[3] = {d0, d1, d2};
return SetToDevice(devID, GetCell(dims, 3), value);
}
......
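The fix passes {d0, d1, d2} instead of {d0, d1, d1}: with the old index array the last coordinate silently reused d1, so Set3D wrote to the wrong cell whenever d1 != d2. Assuming the usual row-major layout behind GetCell, the addressed offset is (d0 * dimSize[1] + d1) * dimSize[2] + d2; a small illustration (Offset3D is a hypothetical helper):

```cpp
#include <cstdio>

// Row-major offset of element (d0, d1, d2) in a tensor of shape
// (dimSize0, dimSize1, dimSize2); with the old {d0, d1, d1} index array
// the last coordinate would have been d1 instead of d2.
int Offset3D(int d0, int d1, int d2, int dimSize1, int dimSize2)
{
    return (d0 * dimSize1 + d1) * dimSize2 + d2;
}

int main()
{
    // shape (2, 3, 4): setting (1, 2, 3) should hit offset 23, not offset 22
    std::printf("correct: %d, with the old bug: %d\n",
                Offset3D(1, 2, 3, 3, 4), Offset3D(1, 2, 2, 3, 4));
    return 0;
}
```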