bug fixes

1a687dab · xiaotong · 117d5109 · 1a687dab · 1a687dab · 1a687dab
Commit 1a687dab authored Sep 07, 2018 by xiaotong
--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -119,9 +119,19 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)

    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
+
    if(isMasked)
        dot = dot + mask;
-    scalar = Softmax(Linear(dot, 1.0F/(float)sqrt((float)dk)), -1);
+
+    dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
+
+    //if(llnum == 1)
+    //    dot.Dump(tf, "dot:");
+
+    scalar = Softmax(dot, -1);
+
+    //if(llnum == 1)
+    //    scalar.Dump(tf, "scalar:");

    //if(ignored > 0)
    //    _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -103,6 +103,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
        XTensor fnn;
        XTensor res;

+        llnum = -1;
+
        /* we skip the residual connection for the first layer if
           the encoder is used in language modeling. */
        if(skipInputRes && i == 0){
@@ -115,6 +117,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
            x = attLayerNorms[i].Make(att); 
        }
        else{
+            //if(i == 1)
+            //    x.Dump(tf, "x:");
+            //if(i == 1)
+            //    llnum = 1;
+
            /* self attention */
            att = attentions[i].Make(x, x, x, mask);

@@ -125,6 +132,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)

            /* layer normalization */
            x = attLayerNorms[i].Make(res);
+
+            llnum = -1;
        }

        /* fnn */

--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -130,7 +130,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
        
        _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
        
-        _Sum(&mask, padding3, &mask);
+        //_Sum(&mask, padding3, &mask);

        encoding = MakeEncoding(input, mask, true);
        outputLayer.Make(encoding, output);

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -90,7 +90,6 @@ void T2TTrainer::Init(int argc, const char ** argv)

 }

-FILE * tf = NULL;
 int tc = 0;

 /* 
@@ -257,7 +256,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
    
    ClearBuf();

-    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 512, isLenSorted, wc, devID, mem)){
+    while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, isLenSorted, wc, devID, mem)){

        CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
            
@@ -503,11 +502,11 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                if(w == seqLen[s] - 1)
                    output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
                wCount++;
-                //fprintf(tf, "%d", buf[seqOffset[s] + w]);
-                //if(w < seqLen[s] - 1)
-                //    fprintf(tf, " ");
-                //else
-                //    fprintf(tf, "\n");
+                /*fprintf(tf, "%d", buf[seqOffset[s] + w]);
+                if(w < seqLen[s] - 1)
+                    fprintf(tf, " ");
+                else
+                    fprintf(tf, "\n");*/
                if(seqs != NULL)
                    seqs[seqSize++] = buf[seqOffset[s] + w];
            }

--- a/source/sample/transformer/T2TUtility.cpp
+++ b/source/sample/transformer/T2TUtility.cpp
@@ -27,6 +27,8 @@ namespace transformer
 {

 FILE * tmpFILE;
+int llnum = 0;
+FILE * tf = NULL;

 void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
 {

--- a/source/sample/transformer/T2TUtility.h
+++ b/source/sample/transformer/T2TUtility.h
@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, 
 /* show arguments */
 void ShowParams(int argc, const char ** argv);

+extern int llnum;
+extern FILE * tf;
+
 }

 #endif
--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -1377,9 +1377,10 @@ dump data to a file
 >> file - where to dump the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
    if (verbose > verboseLevel)
        return;
@@ -1437,28 +1438,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
    }
    if (!isSparse) {
        if (dataType == DEFAULT_DTYPE) {
-            if (unitNum > 0) {
-                DTYPE f = *(DTYPE*)d;
+            int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
+            for(int i = beg; i < end; i++){
+                DTYPE f = ((DTYPE*)d)[i];
+                if(i == beg)
                    fprintf(file, "%e", f);
-            }
-            int num = unitNum;
-            if (n > 0)
-                num = MIN(num, n);
-            for (int i = 1; i < num; i++) {
-                DTYPE * f = ((DTYPE*)d) + i;
-                fprintf(file, " %e", *f);
+                else
+                    fprintf(file, " %e", f);
+
            }
        }
        else {
-            ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
+            ShowNTErrors("TODO!");
        }
    }
    else {
        int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
-        if (n > 0)
-            num = MIN(num, n);
+        if (beg + n > 0)
+            num = MIN(num, beg + n);
        fprintf(file, "%d ", num);
-        for (int i = 0; i < num; i++) {
+        for (int i = beg; i < num; i++) {
            int key = GetKeyInSparse(i);
            DTYPE value = GetInSparse(i);
            fprintf(file, "[%d]%e ", key, value);
@@ -1481,13 +1480,14 @@ dump data to a file
 >> file - where to dump the data
 >> label - label of the tensor
 >> n - number of items to dump
+>> beg - the first item id
 >> verbose - verbose level
 */
-void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
+void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
 {
    XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
    _CopyValues(tensor, &a);
-    a.Dump(file, label, n, verbose);
+    a.Dump(file, label, n, beg, verbose);
 }

 /* 

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -339,11 +339,11 @@ public:
    bool BinarySearch(int key, DTYPE &value, void * &position) const;

    /* dump data to a file */
-    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);

    /* dump data to a file */
    static
-    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);

    /* read data from a file */
    void Read(FILE * file, const char * label = NULL);

--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
    if (tid < 32){
        if (tid < blockDim.y / 32)
            threadMax = data[tid];
-        else threadMax = 0;
+        else threadMax = FLOAT_MIN;
        threadMax = shflDownReduceMax(threadMax);
        if (tid == 0 && blockIdx.y < reducedStrideNum)
            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;