Commit 63e2cfa7 by xiaotong

improve the implementation of softmax

parent 7f483801
......@@ -98,6 +98,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
x = embedder.Make(input);
x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
......
......@@ -406,7 +406,7 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
//model->Dump(fn);
model->Dump(fn);
if(validFN != NULL){
T2TTrainer trainer;
trainer.Init(argNum, argArray);
......
......@@ -34,7 +34,7 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", exp(-1e9F));
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
......
......@@ -55,7 +55,7 @@ namespace nts {
#define DTYPE_MIN (DTYPE)-3.40E+38
#endif
#define LOGPROB_MIN (DTYPE)-1E+15
#define LOGPROB_MIN (DTYPE)-2E+1
#define GRAD_MAX (DTYPE)1E+5
#if WIN32
......
......@@ -78,6 +78,7 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
if (i < rowNum && j < colNum) {
int key = i * colNum + j;
DTYPE r = log(exp(x[key] - inputMax[threadIdx.x]) / inputSum[threadIdx.x]);
if (isnan(r))
r = LOGPROB_MIN;
if (isinf(r))
......@@ -124,6 +125,12 @@ void KernelLogSoftmaxComputeByCol(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
if (i < rowNum && j < colNum) {
int key = i * colNum + j;
DTYPE r = log(exp(x[key] - inputMax[threadIdx.y]) / inputSum[threadIdx.y]);
/*if (r < LOGPROB_MIN)
{
printf("min %e %e, %e %e, %e %e\n", r, x[key] - inputMax[threadIdx.y], x[key], inputMax[threadIdx.y], exp(x[key] - inputMax[threadIdx.y]), inputSum[threadIdx.y]);
}*/
if (isnan(r))
r = LOGPROB_MIN;
if (isinf(r))
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论