DEBUG HELP

90ffc411 · ltb · 9d7cb741 · 90ffc411 · 90ffc411 · 90ffc411
Commit 90ffc411 authored Sep 23, 2019 by ltb
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
--- a/source/network/XBackwardData.cpp
+++ b/source/network/XBackwardData.cpp
@@ -15,10 +15,10 @@
 * limitations under the License.
 */

-/*
- * backward computation for data operation
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
- */
+ /*
+  * backward computation for data operation
+  * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
+  */

 #include "XNoder.h"
 #include "XBackwardData.h"
@@ -27,103 +27,139 @@
 #include "../tensor/core/CHeader.h"
 #include "../tensor/core/getandset/SetData.h"

-namespace nts{
-
-/* compute dE/dx of a node */
-void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
-{
-    CheckNTErrors(node->grad != NULL, "No gradient found!");
-
-    XLink &income = node->income;
-    int operID = income.typeID;
-
-    if(operID == GETANDSET_CONVERTDATATYPE)
-        GradConvertDataType(node, isEfficent);
-    else if(operID == GETANDSET_INDEXTOONEHOT)
-        GradIndexToOnehot(node, isEfficent);
-    else if(operID == GETANDSET_ONEHOTTOINDEX)
-        GradOnehotToIndex(node, isEfficent);
-    else{
-        ShowNTErrors("TODO!");
-    }
-}
-
-/* indicates whether the node is for a data operation */
-bool XDataGrad::IsDataOP(XTensor * node)
-{
-    XLink &income = node->income;
-    return (income.typeID & DATA_BASE) != 0;
-}
-
-/* 
-gradient computation for convert datatype
-for
-b = converdatatype(a) 
-we have
-dE/da = convertdatatype(b)
->> node - the node (c) for backward computation
->> isEfficient - indicates whether the computation is in
-                 an efficient manner
-*/
-void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
-{
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
-
-    XTensor * input = income.tails[0];
-
-    XNoder::MakeGrad(input);
-
-	XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
-    _ConvertDataType(node->grad, tmp);
-	_SumMe(input->grad, tmp);
-	DelTensorBuf(tmp);
-
-	node->visitMark = NODE_FINISHED;
-}
-
-/* 
-gradient computation for OnehotToIndex
-for
-b = OnehotToIndex(a) 
-we have
-dE/da = IndexToOnehot(b)
->> node - the node (c) for backward computation
->> isEfficient - indicates whether the computation is in
-                 an efficient manner
-*/
-void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
-{
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
-
-    XTensor * input = income.tails[0];
-
-    XNoder::MakeGrad(input);
-
-	node->visitMark = NODE_FINISHED;
-}
-
-/* 
-gradient computation for IndexToOnehot
-for
-b = IndexToOnehot(a) 
-we have
-dE/da = IndexToOnehot(b)
->> node - the node (c) for backward computation
->> isEfficient - indicates whether the computation is in
-                 an efficient manner
-*/
-void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
-{
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
-
-    XTensor * input = income.tails[0];
-
-    XNoder::MakeGrad(input);
-
-	node->visitMark = NODE_FINISHED;
-}
+namespace nts {
+
+	/* compute dE/dx of a node: run the backward routine that matches node->income.typeID */
+	void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
+	{
+		CheckNTErrors(node->grad != NULL, "No gradient found!");
+
+		XLink &income = node->income;
+		int operID = income.typeID;
+		/* only the three data operations below are supported; anything else is reported as an error */
+		if (operID == GETANDSET_CONVERTDATATYPE)
+			GradConvertDataType(node, isEfficent);
+		else if (operID == GETANDSET_INDEXTOONEHOT)
+			GradIndexToOnehot(node, isEfficent);
+		else if (operID == GETANDSET_ONEHOTTOINDEX)
+			GradOnehotToIndex(node, isEfficent);
+		else {
+			ShowNTErrors("TODO!");
+		}
+	}
+
+	/* indicates whether the node is for a data operation, i.e. income.typeID shares a bit with DATA_BASE */
+	bool XDataGrad::IsDataOP(XTensor * node)
+	{
+		XLink &income = node->income;
+		return (income.typeID & DATA_BASE) != 0;
+	}
+
+	/*
+	gradient computation for convert datatype
+	for
+	b = convertdatatype(a)
+	we have
+	dE/da += convertdatatype(dE/db)
+	>> node - the node (b) for backward computation
+	>> isEfficent - indicates whether the computation is in
+					 an efficient manner (not used in this function)
+	*/
+	void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
+	{
+		XLink &income = node->income;
+		CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
+		/* the first input tensor of the operation that produced the node */
+		XTensor * input = income.tails[0];
+
+		XNoder::MakeGrad(input);
+		/* temporary buffer created from input->grad; it receives the converted gradient */
+		XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
+		/* disabled debug code: dump node->grad before the conversion */
+		//if (node->dataType == X_FLOAT) {
+		//	FILE * Convert_grad_0 = fopen("Convert_grad_0", "wb");
+		//	node->grad->Dump(node->grad, Convert_grad_0, "Convert_grad_0");
+		//	fclose(Convert_grad_0);
+		//}
+		//if (node->dataType == X_FLOAT16) {
+		//	FILE * Convert_grad_00 = fopen("Convert_grad_00", "wb");
+		//	node->grad->Dump(node->grad, Convert_grad_00, "Convert_grad_00");
+		//	fclose(Convert_grad_00);
+		//}
+		/* convert the gradient of the node into tmp */
+		_ConvertDataType(node->grad, tmp);
+		/* disabled debug code: dump the converted gradient */
+		//if (node->dataType == X_FLOAT) {
+		//	FILE *Convert_grad_1 = fopen("Convert_grad_1", "wb");
+		//	tmp->Dump(tmp, Convert_grad_1, "Convert_grad_1");
+		//	fclose(Convert_grad_1);
+		//}
+		//if (node->dataType == X_FLOAT16) {
+		//	FILE *Convert_grad_10 = fopen("Convert_grad_10", "wb");
+		//	tmp->Dump(tmp, Convert_grad_10, "Convert_grad_10");
+		//	fclose(Convert_grad_10);
+		//}
+		/* accumulate the converted gradient into the gradient of the input */
+		_SumMe(input->grad, tmp);
+		/* disabled debug code: dump the gradient of the input after the accumulation */
+		//if (node->dataType == X_FLOAT) {
+		//	FILE *Convert_grad_2 = fopen("Convert_grad_2", "wb");
+		//	input->grad->Dump(input->grad, Convert_grad_2, "Convert_grad_2");
+		//	fclose(Convert_grad_2);
+		//}
+		//if (node->dataType == X_FLOAT16) {
+		//	FILE *Convert_grad_20 = fopen("Convert_grad_20", "wb");
+		//	input->grad->Dump(input->grad, Convert_grad_20, "Convert_grad_20");
+		//	fclose(Convert_grad_20);
+		//}
+		/* release the temporary buffer */
+		DelTensorBuf(tmp);
+
+		node->visitMark = NODE_FINISHED;
+	}
+
+	/*
+	gradient computation for OnehotToIndex
+	for
+	b = OnehotToIndex(a)
+	nothing is added to dE/da here: the function only calls
+	XNoder::MakeGrad on the input and marks the node as finished
+	>> node - the node (b) for backward computation
+	>> isEfficent - indicates whether the computation is in
+					 an efficient manner (not used in this function)
+	*/
+	void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
+	{
+		XLink &income = node->income;
+		CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");
+
+		XTensor * input = income.tails[0];
+
+		XNoder::MakeGrad(input);
+
+		node->visitMark = NODE_FINISHED;
+	}
+
+	/*
+	gradient computation for IndexToOnehot
+	for
+	b = IndexToOnehot(a)
+	nothing is added to dE/da here: the function only calls
+	XNoder::MakeGrad on the input and marks the node as finished
+	>> node - the node (b) for backward computation
+	>> isEfficent - indicates whether the computation is in
+					 an efficient manner (not used in this function)
+	*/
+	void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
+	{
+		XLink &income = node->income;
+		CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
+
+		XTensor * input = income.tails[0];
+
+		XNoder::MakeGrad(input);
+
+		node->visitMark = NODE_FINISHED;
+	}

 } // namespace nts(NiuTrans.Tensor)
--- a/source/network/XNoder.cpp
+++ b/source/network/XNoder.cpp
@@ -41,7 +41,7 @@ bool XNoder::IsLeaf(XTensor * node)
 {
    if(node == NULL)
        return false;
-
+	// a node without input tensors (e.g. a weight) is a leaf
    if(node->income.tailNum == 0)
        return true;
    else

--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -35,7 +35,7 @@

 namespace fnnlm
 {
-
+int step = 0;
 #define MAX_NAME_LENGTH 1024
 #define MAX_LINE_LENGTH_HERE 1024 * 32

@@ -63,7 +63,7 @@ void Clear(FNNModel &model, bool isNodeGrad);
 void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
 void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
 void Train(const char * train, bool isShuffled, FNNModel &model);
-void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
+void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad,int step);
 float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
 void Dump(const char * fn, FNNModel &model);
 void Read(const char * fn, FNNModel &model);
@@ -77,7 +77,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
              FNNModel &model, FNNModel &grad, FNNNet &net);
 void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
 void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
-void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
+void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model,int step);

 /* 
 entry of the program 
@@ -463,7 +463,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
        strcpy(name, train);
    
    int epoch = 0;
-    int step = 0;
+    //int step = 0;
    int wordCount = 0;
    int wordCountTotal = 0;
    int ngramNum = 1;
@@ -533,7 +533,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);

                /* update model parameters */
-                Update(model, grad, learningRate, false);
+                Update(model, grad, learningRate, false,step);

                /* get probabilities */
                float prob = GetProb(output, gold);
@@ -547,8 +547,12 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                
                /* this is implemented by gather function */
                //ForwardAutoDiff(ngrams, ngramNum, output, model);
+				//if (step == 114) {
+				//	exit(1);
+				//}
+

-                ForwardAutoDiffLin(ngrams, ngramNum, output, model);
+                ForwardAutoDiffLin(ngrams, ngramNum, output, model,step);

                //XNet net;
                //net.ShowNetwork(stdout, &output);
@@ -564,22 +568,53 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                //}
                
                /* this is implemented by multiply function */
-                lossTensor = CrossEntropy(output, gold);
-
-                //FILE* fOut1 = fopen("test3", "a");
-                //fprintf(fOut1, "step=%d ", step);
-                //lossTensor.Dump(&lossTensor, fOut1, "lossTensor:");
-                //fclose(fOut1);
-                //fflush(fOut1);
+				//if (step >112) {
+				//	output.Dump(&output, stderr, "output:", 20);
+				//}

-                int stepTmp = step+1;
+                lossTensor = CrossEntropy(output, gold);

+				if (step > 680 && step < 685) {	/* debug: dump output, gold and loss of steps 681-684 */
+					char op[MAX_NAME_LENGTH];
+					sprintf(op, "output-%d", step);
+					FILE *out = fopen(op, "wb");
+					if (out != NULL) {	/* skip the dump if the file cannot be created */
+						output.Dump(&output, out, "output:");
+						fclose(out);	/* fclose flushes; the stream must not be used afterwards */
+					}
+					char gd[MAX_NAME_LENGTH];
+					sprintf(gd, "gold-%d", step);
+					FILE *golds = fopen(gd, "wb");
+					if (golds != NULL) {
+						gold.Dump(&gold, golds, "gold:");
+						fclose(golds);
+					}
+					char lossTen[MAX_NAME_LENGTH];
+					sprintf(lossTen, "lossTensor-%d", step);
+					FILE *loss = fopen(lossTen, "wb");
+					if (loss != NULL) {
+						lossTensor.Dump(&lossTensor, loss, "loss:");
+						fclose(loss);
+					}
+				}
+				//if (step > 110 && step < 116) {
+				//	char lt[MAX_NAME_LENGTH];
+				//	sprintf(lt, "lossTensor-%d", step);
+				//	FILE *loss = fopen(lt, "wb");
+				//	lossTensor.Dump(&lossTensor, loss, "loss:");
+				//	fclose(loss);
+				//}
                /* automatic differentiation */
                autoDiffer.Backward(lossTensor);

                /* update model parameters */
-                Update(model, grad, learningRate, true);
-
+                Update(model, grad, learningRate, true,step);
+				//if (step > 680 && step < 688) {
+				//	char aw[MAX_NAME_LENGTH];
+				//	sprintf(aw, "wf-%d", step);
+				//	FILE *file = fopen(aw, "wb");
+				//	model.embeddingW.Dump(file, aw);
+				//}
                /* get probabilities */
                float prob = ReduceSumAll(lossTensor);
                loss += prob;
@@ -625,7 +660,7 @@ update the model parameters using the delta rule
 >> epsilon - learning rate
 >> isNodeGrad - indicates whether the gradient is associated with the node
 */
-void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
+void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad,int step)
 {
    TensorList paraList(10);
    TensorList gradList(10);
@@ -648,7 +683,6 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
            gradList.Add(&grad.hiddenW[i]);
            gradList.Add(&grad.hiddenB[i]);
        }
-;
        gradList.Add(&grad.embeddingW);
    }
    else{
@@ -663,21 +697,29 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
        gradList.Add(model.embeddingW.grad);
    }

-    //FILE* fOut1 = fopen("test-2", "a");
    for (int i = 0; i < paraList.count; i++) {
        XTensor * para = (XTensor*)paraList.GetItem(i);
        XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
-       
-        //fprintf(fOut1, "id=%d ", para->id);
-        //para->Dump(para, fOut1, "para:", 50);
-        //paraGrad->Dump(paraGrad, fOut1, "paraGrad:", 50);
+
+		/* debug: dump the node gradients of steps 681-684. They are not modified inside this loop,
+		   so one dump (first iteration) is enough; they may be missing when isNodeGrad is false */
+		if (i == 0 && step > 680 && step < 685) {
+			XTensor * ewGrad = model.embeddingW.grad;
+			char ewName[MAX_NAME_LENGTH];
+			sprintf(ewName, "embeddingW_grad-%d", step);
+			FILE * ewg = (ewGrad != NULL) ? fopen(ewName, "wb") : NULL;
+			if (ewg != NULL) { ewGrad->Dump(ewGrad, ewg, "ewg"); fclose(ewg); }
+
+			XTensor * owGrad = model.outputW.grad;
+			char owName[MAX_NAME_LENGTH];
+			sprintf(owName, "outputW_grad-%d", step);
+			FILE * owg = (owGrad != NULL) ? fopen(owName, "wb") : NULL;
+			if (owg != NULL) { owGrad->Dump(owGrad, owg, "outputW_grad:"); fclose(owg); }
+		}

        /* the delta rule */
        _Sum(para, paraGrad, para, -epsilon);
    }
-    //fprintf(fOut1, "\n");
-    //fclose(fOut1);
-    //fflush(fOut1);
 }
  
 /*
@@ -1161,7 +1203,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
 }

-void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
+void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &model, int xstep)
 {
    int n = model.n;
    int depth = model.hDepth;
@@ -1184,6 +1226,7 @@ void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &mo
    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);

+	/*words.Dump(&words, stderr, "words", 40);*/
    /*test for Gather float16 datatype backward*/
    //XTensor embeddingW16;
    //XTensor embeddingBig16;
@@ -1192,6 +1235,21 @@ void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &mo
    //embeddingBig = ConvertDataType(embeddingBig16, X_FLOAT);

    embeddingBig = Gather(model.embeddingW, words);
+	//embeddingBig.Dump(&embeddingBig, stderr, "embeddingBig", 20);
+	//if (step > 112 && step < 118) {
+
+	//	char e[MAX_NAME_LENGTH];
+	//	sprintf(e, "ew-%d", step);
+
+	//	FILE *ew = fopen(e, "wb");
+	//	model.embeddingW.Dump(ew, "ew");
+	//	fclose(ew);
+	//}
+
+	//float words_reduce = ReduceSumAll(words);
+	//float emW_reduce = ReduceSumAll(model.embeddingW);
+	//float embed = ReduceSumAll(embeddingBig);
+	//fprintf(stderr, "words_reduce:%f emW_reduce:%f embed:%f\n", words_reduce,emW_reduce,embed);

    delete[] index;

@@ -1267,8 +1325,31 @@ void ForwardAutoDiffLin(NGram * ngrams, int batch, XTensor &output, FNNModel &mo
    XTensor softmaxBefore16;
    XTensor output16;
    softmaxBefore = MMul(hidden, model.outputW) + model.outputB;
+
+	//float softmaxReduce = ReduceSumAll(softmaxBefore);
+	//fprintf(stderr,"softmaxReduce:%f\n", softmaxReduce);
+
    softmaxBefore16 = ConvertDataType(softmaxBefore, X_FLOAT16);
+	//softmaxBefore.Dump(&softmaxBefore, stderr, "softmaxBefore", 20);
    output16 = Softmax(softmaxBefore16, 1);
+
+	if (xstep > 680 && xstep < 685) {	/* debug dump for steps 681-684, keyed on the step passed by the caller */
+		char smb[MAX_NAME_LENGTH];
+		sprintf(smb, "softmaxBefore16-%d", xstep);
+		FILE *softb = fopen(smb, "wb");
+		if (softb != NULL) {	/* skip the dump if the file cannot be created */
+			softmaxBefore16.Dump(&softmaxBefore16, softb, "softmaxBefore16:");
+			fclose(softb);
+		}
+		char sm[MAX_NAME_LENGTH];
+		sprintf(sm, "output16-%d", xstep);
+		FILE *soft = fopen(sm, "wb");
+		if (soft != NULL) {
+			output16.Dump(&output16, soft, "output16:");
+			fclose(soft);
+		}
+	}
+ 
    output = ConvertDataType(output16, X_FLOAT);

    //output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);

--- a/source/sample/fnnlm/FNNLM.h
+++ b/source/sample/fnnlm/FNNLM.h
@@ -38,7 +38,7 @@ using namespace nts;

 namespace fnnlm
 {
-
+	extern int step;
 #define _EXIT_(x)// exit(x)
 #define CheckErrors(x, msg) { if(!(x)) { fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg);  _EXIT_(1); } }
 #define ShowErrors(msg) { { fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); _EXIT_(1); } } 

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -1912,6 +1912,11 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
 		_ConvertDataType(tensor, &a);
 		a.Dump(file, label, n, beg, verbose);
 	}
+	else if (tensor->dataType == X_INT) {
+		XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
+		_CopyValues(tensor, &a);
+		a.Dump(file, label, n, beg, verbose);	
+    }
 	else
 	{
 		ShowNTErrors("TO DO!");
@@ -2951,4 +2956,30 @@ void DelTensorBuf(XTensor * tensor)
    delete tensor;
 }

+void Range(XTensor * tensor, int start, int end, int step)
+{
+	/* set the tensor to the integer sequence start, start + step, ...; a NULL tensor is ignored */
+	if (tensor == NULL)
+		return;
+	/* a zero step cannot form a sequence and would divide by zero below */
+	CheckNTErrors((step != 0), "the step of the range must not be zero");
+
+	/* get the length of tensor */
+	int length = tensor->GetDim(0);
+	/* number of items in the sequence, i.e. ceil(|end - start| / |step|) */
+	int span = abs(end - start);
+	int stride = abs(step);
+	int freq = span / stride + (span % stride != 0 ? 1 : 0);
+
+	/* validate before allocating so that a failed check cannot leak the array */
+	CheckNTErrors((length == freq), "the length of the tensor is not matched");
+
+	/* build the sequence and copy it into the tensor */
+	int * index = new int[freq];
+	for (int i = 0; i < freq; i++)
+		index[i] = start + i * step;
+	tensor->SetData(index, freq);
+	delete[] index;
+}
+
 } /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -600,6 +600,8 @@ void DelTensor(XTensor * tensor);
 /* free the data space of a given tensor (on the buffer) */
 void DelTensorBuf(XTensor * tensor);

+void Range(XTensor * tensor, int start, int end, int step);
+
 } /* end of the nts (NiuTrans.Tensor) namespace */

 #endif
--- a/source/tensor/core/getandset/ConvertDataType.cu
+++ b/source/tensor/core/getandset/ConvertDataType.cu
@@ -23,6 +23,7 @@
 #include "../../XTensor.h"
 #include "../../XDevice.h"
 #include "ConvertDataType.cuh"
+#include "../../core/math/Clip.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -156,6 +157,7 @@ convert data type (cuda code)
 */
 void _CudaConvertDataType(const XTensor * input, XTensor * output)
 {
+	
    if (input->dataType == output->dataType)
        return;

@@ -170,12 +172,16 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
    int devIDBackup;
    ProtectCudaDev(input->devID, devIDBackup);

-    if(input->dataType == X_FLOAT && output->dataType == X_INT)
-        KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
-    else if(input->dataType == X_INT && output->dataType == X_FLOAT)
-        KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
-    else if(input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
-        KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
+	if (input->dataType == X_FLOAT && output->dataType == X_INT)
+		KernelFloatToInt << <blocks, threads >> > ((float*)input->data, (int*)output->data, input->unitNum);
+	else if (input->dataType == X_INT && output->dataType == X_FLOAT)
+		KernelIntToFloat << <blocks, threads >> > ((int*)input->data, (float*)output->data, input->unitNum);
+	else if (input->dataType == X_FLOAT && output->dataType == X_FLOAT16) {
+		XTensor *temp = NewTensor(input);
+		_Clip(input,temp, -60000.0F, 60000.0F);
+		KernelFloatToFloat16 << <blocks, threads >> > ((float*)temp->data, (__half*)output->data, input->unitNum);
+		delete temp;
+	}
    else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
        KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
    else if (input->dataType == X_FLOAT && output->dataType == X_INT8)

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -29,6 +29,9 @@
 #include "../core/arithmetic/Sum.h"
 #include "../XDevice.h"
 #include "../XUtility.h"
+#include "../../sample/fnnlm/FNNLM.h"
+
+using namespace fnnlm;

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -382,21 +385,77 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,

        /* make a matrix to keep \beta */
        XTensor * beta = NewTensor(y->order - 1, dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
+		int a = 680;
+		int b = 685;
+		if (step > a && step < b) {
+			char softmax_dedys[1024];
+			sprintf(softmax_dedys, "softmax_dedy-%d", step);
+
+			FILE *softmax_dedy = fopen(softmax_dedys, "wb");
+			dedy->Dump(dedy, softmax_dedy, "softmax_dedy:");
+			fclose(softmax_dedy);
+		}

        /* \beta = \sum_i (dE/dy_i * y_i) */
        _Multiply(dedy, y, ytmp, 0, 0);
+
+
+		if (step > a && step < b) {
+			char softmax_ytmp1[1024];
+			sprintf(softmax_ytmp1, "softmax_ytmp1-%d", step);
+
+			FILE *ytemp_1 = fopen(softmax_ytmp1, "wb");
+			ytmp->Dump(ytmp, ytemp_1, "ytemp-1:");
+			fclose(ytemp_1);
+		}
+
        _ReduceSum(ytmp, beta, leadDim);

+		if (step > a && step < b) {
+			char softmax_betas[1024];
+			sprintf(softmax_betas, "softmax_beta-%d", step);
+			/* beta gets its own file; reusing the "softmax_ytmp1" name would overwrite that dump */
+			FILE *softmax_beta = fopen(softmax_betas, "wb");
+			beta->Dump(beta, softmax_beta, "beta:");
+			fclose(softmax_beta);
+		}
+
+
        /* ytmp = dE/dy_j - \beta */
        _Unsqueeze(beta, ytmp, leadDim, y->dimSize[leadDim]);
+
+		if (step > a && step < b) {
+			char softmax_ytmp2[1024];
+			sprintf(softmax_ytmp2, "softmax_ytmp2-%d", step);
+
+			FILE *ytmp_2 = fopen(softmax_ytmp2, "wb");
+			ytmp->Dump(ytmp, ytmp_2, "ytmp-2:");
+			fclose(ytmp_2);
+		}
+
+
        _Sum(dedy, ytmp, ytmp, -1.0F);

+		if (step > a && step < b) {
+			char softmax_ytmp3[1024];
+			sprintf(softmax_ytmp3, "softmax_ytmp3-%d", step);
+
+			FILE *ytmp_3 = fopen(softmax_ytmp3, "wb");
+			ytmp->Dump(ytmp, ytmp_3, "ytmp-3:");
+			fclose(ytmp_3);
+		}
+
+
        /* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
        _Multiply(y, ytmp, dedx, 0, 0);

        delete[] dimSize;
        delete ytmp;
        delete beta;
+
+		//FILE *file = fopen("softmax_dedx", "wb");
+		//dedx->Dump(dedx, file, "softmax_dedx:");
+		//fclose(file);
    }
    else{
        ShowNTErrors("TODO!");

--- a/source/tensor/loss/CrossEntropy.cpp
+++ b/source/tensor/loss/CrossEntropy.cpp
@@ -30,6 +30,8 @@
 #include "../core/math/ScaleAndShift.h"
 #include "../core/reduce/ReduceSum.h"
 #include "../core/reduce/ReduceSumAll.h"
+#include "../../sample/fnnlm/FNNLM.h"
+using namespace fnnlm;

 namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -63,8 +65,16 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
    //CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");

    XTensor * inter = NewTensor(output);
-    
+
    _Log(output, inter);
+	if (step > 680 && step < 685) {	/* debug: dump log(output) of steps 681-684 */
+		char logName[1024];	/* not called "log", which would hide the math function of that name */
+		sprintf(logName, "loginter-%d", step);
+		FILE *loginter = fopen(logName, "wb");
+		CheckNTErrors(loginter != NULL, "Cannot open the dump file!");
+		inter->Dump(inter, loginter, "loginter:");
+		fclose(loginter);	/* fclose flushes the stream, which must not be used afterwards */
+	}
    _MultiplyMe(inter, gold);

    if(weight != NULL)

--- a/source/tensor/loss/CrossEntropy.cu
+++ b/source/tensor/loss/CrossEntropy.cu
@@ -35,6 +35,9 @@
 #include "../core/reduce/ReduceSumAll.h"
 #include "../core/shape/Transpose.h"
 #include "../core/shape/Unsqueeze.h"
+#include "../../sample/fnnlm/FNNLM.h"
+
+using namespace fnnlm;

 namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -192,8 +195,18 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
        dedy->Reshape(order, dims);

        delete[] paddingDims;
-        delete[] dims;
+        delete[] dims; 
    }
+	if (step > 680 && step < 685) {
+		char output_grad[1024];
+		sprintf(output_grad, "output_grad-%d", step);
+
+		FILE * file = fopen(output_grad, "wb");
+		dedy->Dump(dedy, file, "output_grad:");
+		fclose(file);
+	}
+
+

    //if(padding != NULL) {
    //    XTensor * tmp = NewTensor(padding);