Commit 0e1074ff by xiaotong

improve coding

parent efe32603
......@@ -419,7 +419,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
_Power(b, bTMP, -2);
_Power(b, bTMP, -2.0F);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP);
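This hunk builds the intermediate term of dE/db for c = a / b with b broadcast along dimension n: since d(a/b)/db = -a / b^2, the code negates a, raises b to the power -2 (now written as the explicit float literal -2.0F), multiplies along dim n, and scales by dE/dc. A minimal scalar sketch of the per-element quantity, in plain C++ rather than the NiuTensor API (the function name is illustrative only):

/* per-element contribution to dE/db for c = a / b; the tensor code above
   computes this for every element and then reduce-sums it over every axis
   except n (see the Reshape/_ReduceSum that follow) */
float DivDimGradElement(float dEdc, float a, float b)
{
    return dEdc * (-a / (b * b));
}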
......@@ -433,17 +433,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(interGradTMP, b->grad, 0);
}
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -465,17 +465,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
}
}*/
DelTensorBuf(interGrad);
}
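In the hunks above and below, the else branch that wrote the reduced gradient straight into b->grad is commented out, so the reduced gradient is now always added to the existing b->grad via _Sum, regardless of how many outgoing edges b has. A plausible reason (an assumption; the commit message does not say) is that overwriting is only safe when the gradient is consumed immediately, whereas the delayed updates introduced by updateStep below require accumulation. A minimal sketch of the two policies in plain C++ (names illustrative only):

#include <vector>

/* overwrite: correct only if no earlier contribution must survive */
void OverwriteGrad(std::vector<float> & grad, const std::vector<float> & reduced)
{
    grad = reduced;
}

/* accumulate: the _Sum(b->grad, bGradTMP, b->grad) pattern kept by this commit */
void AccumulateGrad(std::vector<float> & grad, const std::vector<float> & reduced)
{
    for (size_t i = 0; i < grad.size(); ++i)
        grad[i] += reduced[i];
}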
......@@ -765,17 +765,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
/*}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -797,17 +797,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
}
}*/
DelTensorBuf(interGrad);
}
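For GradMultiplyDim, the tensor being reduced here (bGradTMP) is assumed to already hold dE/dc element-wise multiplied by a, computed above this hunk, since for c = a * b with b broadcast along n we have d(a*b)/db = a. A one-line scalar sketch of that per-element term (illustrative name, not the NiuTensor API):

/* per-element contribution to dE/db for c = a * b, before the reduction */
float MultiplyDimGradElement(float dEdc, float a)
{
    return dEdc * a;
}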
......@@ -1059,20 +1059,20 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
}*/
node->grad->Reshape(order, dimSize);
}
......@@ -1097,20 +1097,20 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
}*/
node->grad->Reshape(order, dimSize);
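In GradSubDim, the minus sign of d(a - beta*b)/db = -beta is carried by _Sub(b->grad, bGradTMP, b->grad); the commented-out branch needed the explicit _ScaleAndShiftMe(b->grad, -1.0F) for the same reason, because it wrote into b->grad directly. A scalar sketch of the element the code accumulates (illustrative name only):

/* contribution of one reduced dE/dc element to dE/db for c = a - beta * b */
float SubDimGradElement(float reducedDEdc, float beta)
{
    return -beta * reducedDEdc;
}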
......@@ -1160,7 +1160,7 @@ gradient for sum with one dimension
c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
dE/db = dE/dc * a.reduce(0,...,n-1,n+1,...) * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
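The corrected comment says the reduction runs over every dimension except n. Concretely, in the simplest case below the gradient is reshaped with Reshape(2, reshapedSize) into a [rest, |b|] matrix, summed over its first axis, and scaled by beta. A plain C++ sketch of that reduction (names are illustrative, not the NiuTensor API):

/* dE/db for c = a + beta * b, with dE/dc viewed as a [rest, nSize] matrix */
void SumDimGradSketch(const float * dEdc, float * dEdb,
                      int rest, int nSize, float beta)
{
    for (int j = 0; j < nSize; ++j)
        dEdb[j] = 0.0F;
    for (int i = 0; i < rest; ++i)
        for (int j = 0; j < nSize; ++j)
            dEdb[j] += beta * dEdc[i * nSize + j];
}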
......@@ -1193,19 +1193,19 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
}*/
node->grad->Reshape(order, dimSize);
}
......@@ -1230,19 +1230,19 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
}*/
node->grad->Reshape(order, dimSize);
......
......@@ -111,6 +111,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamFloat(argc, argv, "labelsmoothing", &labelSmoothingP, 0);
LoadParamInt(argc, argv, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
buf = new int[bufSize];
buf2 = new int[bufSize];
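The new updatestep option sets how many batches are processed between two parameter updates; the default of 1 keeps the previous one-update-per-batch behaviour. Judging from the neighbouring LoadParam* calls it is presumably given on the command line roughly as follows (the exact flag syntax is an assumption):

-updatestep 4    (accumulate gradients over four batches before each update)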
......@@ -144,6 +145,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int nStepCheck = 0;
int nCheckpoint = 0;
int nSkipped = 0;
int gradStep = 0;
int validStep = 0;
char * trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn);
......@@ -211,17 +214,26 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
if (doUpdate) {
/* back-propagation */
net.Backward(output, g, CROSSENTROPY);
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)step + 1, -0.5F - lrbias), ((float)step + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* update the parameters */
Update(model, lr);
gradStep += 1;
loss += -prob;
wordCount += wc;
wordCountTotal += wc;
/* update the parameters */
if(gradStep == updateStep){
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)validStep + 1, -0.5F - lrbias), ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* model update */
Update(model, lr);
gradStep = 0;
validStep++;
}
}
else
nSkipped++;
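The rewritten loop decouples backward passes from parameter updates: every batch runs Backward and increments gradStep, and only when gradStep reaches updateStep is the learning rate computed, now indexed by validStep (the count of actual updates) rather than by the raw batch step, and Update called. The schedule itself is the usual inverse-square-root warmup; a self-contained sketch of the expression used above, where d is the model dimension and nwarmup the number of warmup updates (the function name is illustrative only):

#include <cmath>
#include <algorithm>

/* learning rate for the (update + 1)-th parameter update, matching the
   expression in the training loop above */
float TransformerLR(float lrate, int d, int update, int nwarmup, float lrbias)
{
    float t = (float)(update + 1);
    return lrate * (1.0F / std::sqrt((float)d))
                 * std::min(std::pow(t, -0.5F - lrbias),
                            t * std::pow((float)nwarmup, -1.5F - lrbias));
}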
......@@ -827,13 +839,14 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
_CopyValues(padding, padding2);
_MultiplyDim(output, padding2, output, 0);
_ScaleAndShiftMe(padding2, 1e9F, -1e9F);
_SumDim(output, padding2, output, 0);
output->Reshape(on, dimso);
if(gold != NULL){
gold->Reshape(gold->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
gold->Reshape(gold->unitNum/dimso[gold->order - 1], dimso[gold->order - 1]);
_CopyValues(padding, padding2);
_MultiplyDim(gold, padding2, gold, 0);
gold->Reshape(on, dimso);
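The _ScaleAndShiftMe(padding2, 1e9F, -1e9F) call turns the 0/1 padding mask into 0 for real positions and -1e9 for padded ones, so after the multiply and the _SumDim a padded score is pushed to a huge negative value while real scores pass through unchanged; the other change here indexes dimso with gold->order instead of output->order when reshaping gold. A scalar sketch of the masking as I read the three tensor calls (illustrative name, not a separate API):

/* score stays as-is where mask == 1, becomes roughly -1e9 where mask == 0 */
float PadScore(float score, float mask)
{
    return score * mask + (1e9F * mask - 1e9F);
}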
......@@ -847,11 +860,10 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
perform label smoothing
>> gold - gold standard
>> smoothed - result of label smoothing
>> lsP - smoothing factor
>> p - smoothing factor
*/
void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE lsP)
void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
{
DTYPE p = lsP;
CheckNTErrors(p >= 0 && p <= 1.0F, "Smoothing factor must be in range [0,1]");
int n = gold->GetDim(-1);
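Only the head of LabelSmooth is shown; assuming the body implements standard label smoothing over the last dimension (with n = gold->GetDim(-1)), each target probability becomes gold * (1 - p) + p / n. A scalar sketch under that assumption (illustrative name only):

/* standard label smoothing of one target entry;
   n is the size of the last dimension of gold */
float SmoothLabel(float gold, float p, int n)
{
    return gold * (1.0F - p) + p / (float)n;
}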
......
......@@ -124,6 +124,9 @@ public:
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches over which gradients are accumulated before each model update */
int updateStep;
public:
/* constructor */
......@@ -174,7 +177,7 @@ public:
void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* perform label smoothing */
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE lsP);
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
};
......
......@@ -34,7 +34,7 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", exp(DTYPE_MIN));
fprintf(stderr, "%e\n", exp(-1e9F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
......