Commit 0e1074ff by xiaotong

improve coding

parent efe32603
@@ -419,7 +419,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
     XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
     _Negate(a, aTMP1);
-    _Power(b, bTMP, -2);
+    _Power(b, bTMP, -2.0F);
     _MultiplyDim(aTMP1, bTMP, aTMP2, n);
     _Multiply(node->grad, aTMP2, interGradTMP);
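Note: interGradTMP above holds the elementwise term of the gradient with respect to b. Assuming, as the GradSumDim comment further down suggests, that the forward operation is c = a / b with b matching dimension n of a, the quantity being built is (a sketch of the math, not a comment taken from the source):

    \frac{\partial E}{\partial b} = \sum_{\text{dims} \neq n} \frac{\partial E}{\partial c} \cdot \left(-\frac{a}{b^{2}}\right)

which corresponds to the _Negate, _Power(..., -2.0F), _MultiplyDim, _Multiply sequence, followed by the _ReduceSum calls in the next two hunks.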
@@ -433,17 +433,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     interGradTMP->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGradTMP, bGradTMP, 0);
         _Sum(b->grad, bGradTMP, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(interGradTMP, b->grad, 0);
-    }
+    }*/
 }
 else{
     int reshapedSize[MAX_TENSOR_DIM_NUM];
@@ -465,17 +465,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
     XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
     _ReduceSum(interGradTMP, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP2, 0);
         _Sum(b->grad, bGradTMP2, b->grad);
         DelTensorBuf(bGradTMP2);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
-    }
+    }*/
     DelTensorBuf(interGrad);
 }
@@ -765,17 +765,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     bGradTMP->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(bGradTMP, bGradTMP2, 0);
         _Sum(b->grad, bGradTMP2, b->grad);
         DelTensorBuf(bGradTMP2);
-    }
+    /*}
     else{
         _ReduceSum(bGradTMP, b->grad, 0);
-    }
+    }*/
 }
 else{
     int reshapedSize[MAX_TENSOR_DIM_NUM];
@@ -797,17 +797,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
     XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
     _ReduceSum(bGradTMP, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP2, 0);
         _Sum(b->grad, bGradTMP2, b->grad);
         DelTensorBuf(bGradTMP2);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
-    }
+    }*/
     DelTensorBuf(interGrad);
 }
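Note: the same accumulation pattern is applied in GradMultiplyDim. Assuming the forward operation is c = a * b with b broadcast along dimension n (the construction of bGradTMP is not shown in this hunk), the tensor reduced here would correspond to

    \frac{\partial E}{\partial b} = \sum_{\text{dims} \neq n} \frac{\partial E}{\partial c} \cdot a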
@@ -1059,20 +1059,20 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     node->grad->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(node->grad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sub(b->grad, bGradTMP, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(node->grad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
         _ScaleAndShiftMe(b->grad, -1.0F);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
 }
@@ -1097,20 +1097,20 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
     _ReduceSum(node->grad, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sub(b->grad, bGradTMP, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
         _ScaleAndShiftMe(b->grad, -1.0F);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
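Note: GradSubDim presumably mirrors GradSumDim below with c = a - b * \beta and b matching dimension n of a, so the reduced gradient applied to b->grad is

    \frac{\partial E}{\partial b} = -\beta \sum_{\text{dims} \neq n} \frac{\partial E}{\partial c}

which is why the temporary is scaled by beta and then applied with _Sub rather than _Sum.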
@@ -1160,7 +1160,7 @@ gradient for sum with one dimension
 c = a + b * \beta
 where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
 dE/da = dE/dc
-dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
+dE/db = dE/dc * a.reduce(0,...,n-1,n+1,...) * \beta
 >> node - the node (c) for backward computation
 >> isEfficient - indicates whether the computation is in
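Note: written element-wise, the corrected comment line says that dE/dc is summed over every axis except n and scaled by beta:

    \left(\frac{\partial E}{\partial b}\right)_{j} = \beta \sum_{i\,:\,i_{n}=j} \left(\frac{\partial E}{\partial c}\right)_{i}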
@@ -1193,19 +1193,19 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
        size of b. Then we can reduce the matrix into a row vector. */
     node->grad->Reshape(2, reshapedSize);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(node->grad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sum(bGradTMP, b->grad, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(node->grad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
 }
@@ -1230,19 +1230,19 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
     _ReduceSum(node->grad, interGrad, 2);
-    if(b->outgo.tailNum > 1){
+    //if(b->outgo.tailNum > 1){
         XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
         _ReduceSum(interGrad, bGradTMP, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(bGradTMP, beta);
         _Sum(bGradTMP, b->grad, b->grad);
         DelTensorBuf(bGradTMP);
-    }
+    /*}
     else{
         _ReduceSum(interGrad, b->grad, 0);
         if(beta != 1.0F)
             _ScaleAndShiftMe(b->grad, beta);
-    }
+    }*/
     node->grad->Reshape(order, dimSize);
...
@@ -111,6 +111,7 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamFloat(argc, argv, "labelsmoothing", &labelSmoothingP, 0);
     LoadParamInt(argc, argv, "nstepcheckpoint", &nStepCheckpoint, -1);
     LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
+    LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
     buf = new int[bufSize];
     buf2 = new int[bufSize];
@@ -144,6 +145,8 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     int nStepCheck = 0;
     int nCheckpoint = 0;
     int nSkipped = 0;
+    int gradStep = 0;
+    int validStep = 0;
     char * trainFN = new char[(int)strlen(fn) + 10];
     strcpy(trainFN, fn);
@@ -211,17 +214,26 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
     if (doUpdate) {
+        /* back-propagation */
         net.Backward(output, g, CROSSENTROPY);
-        /* learning rate */
-        lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)step + 1, -0.5F - lrbias), ((float)step + 1) * pow((float)nwarmup, -1.5F - lrbias));
-        /* update the parameters */
-        Update(model, lr);
+        gradStep += 1;
         loss += -prob;
         wordCount += wc;
         wordCountTotal += wc;
+        /* update the parameters */
+        if(gradStep == updateStep){
+            /* learning rate */
+            lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)validStep + 1, -0.5F - lrbias), ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
+            /* model update */
+            Update(model, lr);
+            gradStep = 0;
+            validStep++;
+        }
     }
     else
         nSkipped++;
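Note: the learning-rate expression is the usual Transformer warm-up schedule, now indexed by the number of parameter updates (validStep) rather than the number of processed batches:

    lr = lrate \cdot d^{-1/2} \cdot \min\left((t+1)^{-0.5-\mathrm{lrbias}},\ (t+1)\cdot \mathrm{nwarmup}^{-1.5-\mathrm{lrbias}}\right), \quad t = \mathrm{validStep}

With updatestep set to k, Backward runs on every batch but Update is called only once per k batches, so gradients are presumably accumulated across those k batches and only the update count drives the schedule.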
@@ -827,13 +839,14 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
     XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
     _CopyValues(padding, padding2);
+    _MultiplyDim(output, padding2, output, 0);
     _ScaleAndShiftMe(padding2, 1e9F, -1e9F);
     _SumDim(output, padding2, output, 0);
     output->Reshape(on, dimso);
     if(gold != NULL){
-        gold->Reshape(gold->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
+        gold->Reshape(gold->unitNum/dimso[gold->order - 1], dimso[gold->order - 1]);
         _CopyValues(padding, padding2);
         _MultiplyDim(gold, padding2, gold, 0);
         gold->Reshape(on, dimso);
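Note: assuming padding holds 0/1 entries p, the three calls on output implement

    \mathrm{output} \leftarrow \mathrm{output}\cdot p + (10^{9}p - 10^{9})

so positions with p = 1 keep their value (the added term is zero) while padded positions are pushed to -1e9; the newly added _MultiplyDim also clears the original scores at padded positions before the shift.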
@@ -847,11 +860,10 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
 perform label smoothing
 >> gold - gold standard
 >> smoothed - result of label smoothing
->> lsP - smoothing factor
+>> p - smoothing factor
 */
-void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE lsP)
+void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
 {
-    DTYPE p = lsP;
     CheckNTErrors(p >= 0 && p <= 1.0F, "Smoothing factor must be in range [0,1]");
     int n = gold->GetDim(-1);
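Note: the rest of the function is not shown here; under the standard label-smoothing formulation that the parameters p and n suggest, the smoothed target would be

    \mathrm{smoothed} = (1 - p)\cdot \mathrm{gold} + \frac{p}{n}

with n the size of the last dimension of gold.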
...
@@ -124,6 +124,9 @@ public:
     /* indicates whether we make a checkpoint after each traing epoch */
     bool useEpochCheckpoint;
+    /* number of batches on which we do model update */
+    int updateStep;
 public:
     /* constructor */
@@ -174,7 +177,7 @@ public:
     void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
     /* perform label smoothing */
-    void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE lsP);
+    void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
 };
...
@@ -34,7 +34,7 @@ int TransformerMain(int argc, const char ** argv)
     if(argc == 0)
         return 1;
-    fprintf(stderr, "%e\n", exp(DTYPE_MIN));
+    fprintf(stderr, "%e\n", exp(-1e9F));
     char ** args = new char*[argc];
     for(int i = 0; i < argc; i++){
...