Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
T
Tensor.LowPrecision
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
linye
Tensor.LowPrecision
Commits
2c4061e9
Commit
2c4061e9
authored
Jul 30, 2019
by
ltb
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed FNNLM of branch of xiao
parent
3800528b
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
570 行增加
和
564 行删除
+570
-564
source/network/Main.cpp
+0
-1
source/sample/fnnlm/FNNLM.cpp
+340
-337
source/tensor/Main.cpp
+219
-215
source/tensor/loss/CrossEntropy.cu
+11
-11
没有找到文件。
source/network/Main.cpp
查看文件 @
2c4061e9
...
...
@@ -24,7 +24,6 @@
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
...
...
source/sample/fnnlm/FNNLM.cpp
查看文件 @
2c4061e9
...
...
@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
/*
*
* This is a simple impelementation of the feed-forward network-baesd language
* model (FNNLM). See more details about FNNLM in
...
...
@@ -32,6 +32,7 @@
#include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h"
#include "../../tensor/core/math/ScaleAndShift.h"
namespace
fnnlm
{
...
...
@@ -39,50 +40,50 @@ namespace fnnlm
#define MAX_NAME_LENGTH 1024
#define MAX_LINE_LENGTH_HERE 1024 * 32
char
trainFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the training data
char
modelFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the FNN model
char
testFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the test data
char
outputFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the result data
float
learningRate
=
0.01
F
;
// learning rate
int
nStep
=
10000000
;
// max learning steps (or model updates)
int
nEpoch
=
10
;
// max training epochs
float
minmax
=
0.08
F
;
// range [-p,p] for parameter initialization
int
sentBatch
=
0
;
// batch size at the sentence level
int
wordBatch
=
1
;
// batch size at the word level
bool
shuffled
=
false
;
// shuffled the training data file or not
bool
autoDiff
=
false
;
// indicator of automatic differentiation
void
LoadArgs
(
int
argc
,
const
char
**
argv
,
FNNModel
&
model
);
void
Init
(
FNNModel
&
model
);
void
Check
(
FNNModel
&
model
);
void
Copy
(
FNNModel
&
tgt
,
FNNModel
&
src
);
void
Clear
(
FNNModel
&
model
,
bool
isNodeGrad
);
void
InitModelTensor1D
(
XTensor
&
tensor
,
int
num
,
FNNModel
&
model
);
void
InitModelTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
FNNModel
&
model
);
void
Train
(
const
char
*
train
,
bool
isShuffled
,
FNNModel
&
model
);
void
Update
(
FNNModel
&
model
,
FNNModel
&
grad
,
float
epsilon
,
bool
isNodeGrad
);
float
GetProb
(
XTensor
&
output
,
XTensor
&
gold
,
XTensor
*
wordProbs
=
NULL
);
void
Dump
(
const
char
*
fn
,
FNNModel
&
model
);
void
Read
(
const
char
*
fn
,
FNNModel
&
model
);
void
Test
(
const
char
*
test
,
const
char
*
result
,
FNNModel
&
model
);
int
LoadNGrams
(
FILE
*
file
,
int
n
,
NGram
*
ngrams
,
int
sentNum
,
int
wordNum
);
void
InitZeroOneTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
int
*
rows
,
int
*
cols
,
char
trainFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the training data
char
modelFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the FNN model
char
testFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the test data
char
outputFN
[
MAX_NAME_LENGTH
]
=
""
;
// file name of the result data
float
learningRate
=
0.01
F
;
// learning rate
int
nStep
=
10000000
;
// max learning steps (or model updates)
int
nEpoch
=
10
;
// max training epochs
float
minmax
=
0.08
F
;
// range [-p,p] for parameter initialization
int
sentBatch
=
0
;
// batch size at the sentence level
int
wordBatch
=
1
;
// batch size at the word level
bool
shuffled
=
false
;
// shuffled the training data file or not
bool
autoDiff
=
false
;
// indicator of automatic differentiation
void
LoadArgs
(
int
argc
,
const
char
**
argv
,
FNNModel
&
model
);
void
Init
(
FNNModel
&
model
);
void
Check
(
FNNModel
&
model
);
void
Copy
(
FNNModel
&
tgt
,
FNNModel
&
src
);
void
Clear
(
FNNModel
&
model
,
bool
isNodeGrad
);
void
InitModelTensor1D
(
XTensor
&
tensor
,
int
num
,
FNNModel
&
model
);
void
InitModelTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
FNNModel
&
model
);
void
Train
(
const
char
*
train
,
bool
isShuffled
,
FNNModel
&
model
);
void
Update
(
FNNModel
&
model
,
FNNModel
&
grad
,
float
epsilon
,
bool
isNodeGrad
);
float
GetProb
(
XTensor
&
output
,
XTensor
&
gold
,
XTensor
*
wordProbs
=
NULL
);
void
Dump
(
const
char
*
fn
,
FNNModel
&
model
);
void
Read
(
const
char
*
fn
,
FNNModel
&
model
);
void
Test
(
const
char
*
test
,
const
char
*
result
,
FNNModel
&
model
);
int
LoadNGrams
(
FILE
*
file
,
int
n
,
NGram
*
ngrams
,
int
sentNum
,
int
wordNum
);
void
InitZeroOneTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
int
*
rows
,
int
*
cols
,
int
itemNum
,
int
devID
,
XMem
*
mem
);
void
MakeWordBatch
(
XTensor
&
batch
,
NGram
*
ngrams
,
int
ngramNum
,
int
n
,
int
vSize
,
int
devID
,
XMem
*
mem
);
void
Forward
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
,
FNNNet
&
net
);
void
Backward
(
XTensor
inputs
[],
XTensor
&
output
,
XTensor
&
gold
,
LOSS_FUNCTION_NAME
loss
,
void
MakeWordBatch
(
XTensor
&
batch
,
NGram
*
ngrams
,
int
ngramNum
,
int
n
,
int
vSize
,
int
devID
,
XMem
*
mem
);
void
Forward
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
,
FNNNet
&
net
);
void
Backward
(
XTensor
inputs
[],
XTensor
&
output
,
XTensor
&
gold
,
LOSS_FUNCTION_NAME
loss
,
FNNModel
&
model
,
FNNModel
&
grad
,
FNNNet
&
net
);
void
ForwardAutoDiff
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
);
void
ForwardAutoDiff
(
NGram
*
ngrams
,
int
batch
,
XTensor
&
output
,
FNNModel
&
model
);
void
ForwardAutoDiff
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
);
void
ForwardAutoDiff
(
NGram
*
ngrams
,
int
batch
,
XTensor
&
output
,
FNNModel
&
model
);
/*
entry of the program
>> argc - number of the arguments
>> argv - pointers to the arguments
<< return - error code
/*
entry of the program
>> argc - number of the arguments
>> argv - pointers to the arguments
<< return - error code
arguments:
arguments:
-train S: specify training data file name
-model S: specify model file name
-test S: specify test data file name
...
...
@@ -110,10 +111,10 @@ arguments:
E.g.,
0 29 2 11 1
might be a line of the file.
*/
int
FNNLMMain
(
int
argc
,
const
char
**
argv
)
{
if
(
argc
==
0
)
*/
int
FNNLMMain
(
int
argc
,
const
char
**
argv
)
{
if
(
argc
==
0
)
return
1
;
FNNModel
model
;
...
...
@@ -128,127 +129,127 @@ int FNNLMMain(int argc, const char ** argv)
Init
(
model
);
/* learn model parameters */
if
(
strcmp
(
trainFN
,
""
))
if
(
strcmp
(
trainFN
,
""
))
Train
(
trainFN
,
shuffled
,
model
);
/* save the final model */
if
(
strcmp
(
modelFN
,
""
)
&&
strcmp
(
trainFN
,
""
))
if
(
strcmp
(
modelFN
,
""
)
&&
strcmp
(
trainFN
,
""
))
Dump
(
modelFN
,
model
);
/* load the model if neccessary */
if
(
strcmp
(
modelFN
,
""
))
if
(
strcmp
(
modelFN
,
""
))
Read
(
modelFN
,
model
);
/* test the model on the new data */
if
(
strcmp
(
testFN
,
""
)
&&
strcmp
(
outputFN
,
""
))
if
(
strcmp
(
testFN
,
""
)
&&
strcmp
(
outputFN
,
""
))
Test
(
testFN
,
outputFN
,
model
);
return
0
;
}
/*
load arguments
>> argc - number of the arguments
>> argv - pointers to the arguments
>> model - the fnn model
*/
void
LoadArgs
(
int
argc
,
const
char
**
argv
,
FNNModel
&
model
)
{
}
/*
load arguments
>> argc - number of the arguments
>> argv - pointers to the arguments
>> model - the fnn model
*/
void
LoadArgs
(
int
argc
,
const
char
**
argv
,
FNNModel
&
model
)
{
fprintf
(
stderr
,
"args:
\n
"
);
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
{
if
(
!
strcmp
(
argv
[
i
],
"-train"
)
&&
i
+
1
<
argc
)
{
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
{
if
(
!
strcmp
(
argv
[
i
],
"-train"
)
&&
i
+
1
<
argc
)
{
strcpy
(
trainFN
,
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -train=%s
\n
"
,
argv
[
i
+
1
]);
}
if
(
!
strcmp
(
argv
[
i
],
"-model"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-model"
)
&&
i
+
1
<
argc
)
{
strcpy
(
modelFN
,
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -model=%s
\n
"
,
argv
[
i
+
1
]);
}
if
(
!
strcmp
(
argv
[
i
],
"-test"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-test"
)
&&
i
+
1
<
argc
)
{
strcpy
(
testFN
,
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -test=%s
\n
"
,
argv
[
i
+
1
]);
}
if
(
!
strcmp
(
argv
[
i
],
"-output"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-output"
)
&&
i
+
1
<
argc
)
{
strcpy
(
outputFN
,
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -output=%s
\n
"
,
argv
[
i
+
1
]);
}
if
(
!
strcmp
(
argv
[
i
],
"-n"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-n"
)
&&
i
+
1
<
argc
)
{
model
.
n
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -n=%d
\n
"
,
model
.
n
);
}
if
(
!
strcmp
(
argv
[
i
],
"-esize"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-esize"
)
&&
i
+
1
<
argc
)
{
model
.
eSize
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -esize=%d
\n
"
,
model
.
eSize
);
}
if
(
!
strcmp
(
argv
[
i
],
"-vsize"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-vsize"
)
&&
i
+
1
<
argc
)
{
model
.
vSize
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -vsize=%d
\n
"
,
model
.
vSize
);
}
if
(
!
strcmp
(
argv
[
i
],
"-hdepth"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-hdepth"
)
&&
i
+
1
<
argc
)
{
model
.
hDepth
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -hdepth=%d
\n
"
,
model
.
hDepth
);
}
if
(
!
strcmp
(
argv
[
i
],
"-hsize"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-hsize"
)
&&
i
+
1
<
argc
)
{
model
.
hSize
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -hsize=%d
\n
"
,
model
.
hSize
);
}
if
(
!
strcmp
(
argv
[
i
],
"-lrate"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-lrate"
)
&&
i
+
1
<
argc
)
{
learningRate
=
(
float
)
atof
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -lrate=%f
\n
"
,
learningRate
);
}
if
(
!
strcmp
(
argv
[
i
],
"-nstep"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-nstep"
)
&&
i
+
1
<
argc
)
{
nStep
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -nstep=%d
\n
"
,
nStep
);
}
if
(
!
strcmp
(
argv
[
i
],
"-nepoch"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-nepoch"
)
&&
i
+
1
<
argc
)
{
nEpoch
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -nepoch=%d
\n
"
,
nEpoch
);
}
if
(
!
strcmp
(
argv
[
i
],
"-minmax"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-minmax"
)
&&
i
+
1
<
argc
)
{
minmax
=
(
float
)
fabs
(
atof
(
argv
[
i
+
1
]));
fprintf
(
stderr
,
" -minmax=%f
\n
"
,
minmax
);
}
if
(
!
strcmp
(
argv
[
i
],
"-batch"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-batch"
)
&&
i
+
1
<
argc
)
{
sentBatch
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -batch=%d
\n
"
,
sentBatch
);
}
if
(
!
strcmp
(
argv
[
i
],
"-wbatch"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-wbatch"
)
&&
i
+
1
<
argc
)
{
wordBatch
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -wbatch=%d
\n
"
,
wordBatch
);
}
if
(
!
strcmp
(
argv
[
i
],
"-shuffle"
))
{
if
(
!
strcmp
(
argv
[
i
],
"-shuffle"
))
{
shuffled
=
true
;
fprintf
(
stderr
,
" -shuffle=true
\n
"
);
}
if
(
!
strcmp
(
argv
[
i
],
"-autodiff"
))
{
if
(
!
strcmp
(
argv
[
i
],
"-autodiff"
))
{
autoDiff
=
true
;
fprintf
(
stderr
,
" -autodiff=true
\n
"
);
}
if
(
!
strcmp
(
argv
[
i
],
"-dev"
)
&&
i
+
1
<
argc
)
{
if
(
!
strcmp
(
argv
[
i
],
"-dev"
)
&&
i
+
1
<
argc
)
{
model
.
devID
=
atoi
(
argv
[
i
+
1
]);
fprintf
(
stderr
,
" -dev=%d
\n
"
,
model
.
devID
);
}
}
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
{
for
(
int
i
=
0
;
i
<
argc
;
i
++
)
{
if
(
!
strcmp
(
argv
[
i
],
"-mempool"
))
model
.
mem
=
new
XMem
(
model
.
devID
);
}
}
}
/* check model settings */
void
Check
(
FNNModel
&
model
)
{
/* check model settings */
void
Check
(
FNNModel
&
model
)
{
CheckErrors
(
model
.
n
>
0
&&
model
.
n
<=
MAX_N_GRAM
,
"The LM order is out of range (use -n)!"
);
CheckErrors
(
model
.
vSize
>
0
,
"no vocabulary size found (use -vsize)!"
);
CheckErrors
(
model
.
eSize
>
0
,
"no embedding size found (use -esize)!"
);
}
}
/* make a hard copy of the fnn model */
void
Copy
(
FNNModel
&
tgt
,
FNNModel
&
src
)
{
/* make a hard copy of the fnn model */
void
Copy
(
FNNModel
&
tgt
,
FNNModel
&
src
)
{
InitTensorV2
(
&
tgt
.
embeddingW
,
&
src
.
embeddingW
);
for
(
int
i
=
0
;
i
<
MAX_HIDDEN_NUM
;
i
++
)
{
for
(
int
i
=
0
;
i
<
MAX_HIDDEN_NUM
;
i
++
)
{
InitTensorV2
(
&
tgt
.
hiddenW
[
i
],
&
src
.
hiddenW
[
i
]);
InitTensorV2
(
&
tgt
.
hiddenB
[
i
],
&
src
.
hiddenB
[
i
]);
}
...
...
@@ -262,33 +263,33 @@ void Copy(FNNModel &tgt, FNNModel &src)
tgt
.
vSize
=
src
.
vSize
;
tgt
.
devID
=
src
.
devID
;
tgt
.
useMemPool
=
src
.
useMemPool
;
if
(
src
.
mem
!=
NULL
)
{
if
(
src
.
mem
!=
NULL
)
{
tgt
.
mem
=
new
XMem
(
src
.
mem
->
devID
,
src
.
mem
->
mode
,
src
.
mem
->
maxBlockSize
,
src
.
mem
->
blockNum
,
src
.
mem
->
bufSize
);
}
}
}
/*
reset model parameters
>> model - the model whose parameter (gradient) is set to 0
>> isNodeGrad - indicates whether the tensor node keeps the
/*
reset model parameters
>> model - the model whose parameter (gradient) is set to 0
>> isNodeGrad - indicates whether the tensor node keeps the
gradient information
*/
void
Clear
(
FNNModel
&
model
,
bool
isNodeGrad
)
{
*/
void
Clear
(
FNNModel
&
model
,
bool
isNodeGrad
)
{
if
(
isNodeGrad
)
{
if
(
model
.
embeddingW
.
grad
!=
NULL
)
if
(
model
.
embeddingW
.
grad
!=
NULL
)
model
.
embeddingW
.
grad
->
SetZeroAll
();
for
(
int
i
=
0
;
i
<
MAX_HIDDEN_NUM
;
i
++
)
{
if
(
model
.
hiddenW
[
i
].
grad
!=
NULL
)
if
(
model
.
hiddenW
[
i
].
grad
!=
NULL
)
model
.
hiddenW
[
i
].
grad
->
SetZeroAll
();
if
(
model
.
hiddenB
[
i
].
grad
!=
NULL
)
if
(
model
.
hiddenB
[
i
].
grad
!=
NULL
)
model
.
hiddenB
[
i
].
grad
->
SetZeroAll
();
}
if
(
model
.
outputW
.
grad
!=
NULL
)
if
(
model
.
outputW
.
grad
!=
NULL
)
model
.
outputW
.
grad
->
SetZeroAll
();
if
(
model
.
outputB
.
grad
!=
NULL
)
if
(
model
.
outputB
.
grad
!=
NULL
)
model
.
outputB
.
grad
->
SetZeroAll
();
}
else
{
...
...
@@ -300,76 +301,78 @@ void Clear(FNNModel &model, bool isNodeGrad)
model
.
outputW
.
SetZeroAll
();
model
.
outputB
.
SetZeroAll
();
}
}
}
/*
initialize a 1d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> num - number of items
>> model - the fnn model
*/
void
InitModelTensor1D
(
XTensor
&
tensor
,
int
num
,
FNNModel
&
model
)
{
/*
initialize a 1d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> num - number of items
>> model - the fnn model
*/
void
InitModelTensor1D
(
XTensor
&
tensor
,
int
num
,
FNNModel
&
model
)
{
InitTensor1DV2
(
&
tensor
,
num
,
X_FLOAT
,
model
.
devID
);
}
/*
initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> model - the fnn model
*/
void
InitModelTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
FNNModel
&
model
)
{
}
/*
initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> model - the fnn model
*/
void
InitModelTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
FNNModel
&
model
)
{
InitTensor2DV2
(
&
tensor
,
rowNum
,
colNum
,
X_FLOAT
,
model
.
devID
);
}
}
/* initialize the model */
void
Init
(
FNNModel
&
model
)
{
/* initialize the model */
void
Init
(
FNNModel
&
model
)
{
/* create embedding parameter matrix: vSize * eSize */
InitModelTensor2D
(
model
.
embeddingW
,
model
.
vSize
,
model
.
eSize
,
model
);
model
.
embeddingW
.
SetVarFlag
();
/* create hidden layer parameter matrics */
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
/* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
hsize * hsize otherwise */
if
(
i
==
0
)
if
(
i
==
0
)
InitModelTensor2D
(
model
.
hiddenW
[
i
],
(
model
.
n
-
1
)
*
model
.
eSize
,
model
.
hSize
,
model
);
else
InitModelTensor2D
(
model
.
hiddenW
[
i
],
model
.
hSize
,
model
.
hSize
,
model
);
model
.
hiddenW
[
i
].
SetVarFlag
();
/* bias term: a row vector of hSize entries */
InitModelTensor1D
(
model
.
hiddenB
[
i
],
model
.
hSize
,
model
);
model
.
hiddenB
[
i
].
SetVarFlag
();
}
/* create the output layer parameter matrix and bias term */
int
iSize
=
model
.
hDepth
==
0
?
(
model
.
n
-
1
)
*
model
.
eSize
:
model
.
hSize
;
InitModelTensor2D
(
model
.
outputW
,
iSize
,
model
.
vSize
,
model
);
InitModelTensor1D
(
model
.
outputB
,
model
.
vSize
,
model
);
model
.
outputW
.
SetVarFlag
();
model
.
outputB
.
SetVarFlag
();
/* then, we initialize model parameters using a uniform distribution in range
of [-minmax, minmax] */
model
.
embeddingW
.
SetDataRand
(
-
minmax
,
minmax
);
model
.
outputW
.
SetDataRand
(
-
minmax
,
minmax
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
model
.
hiddenW
[
i
].
SetDataRand
(
-
minmax
,
minmax
);
/* all bias terms are set to zero */
model
.
outputB
.
SetZeroAll
();
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
model
.
hiddenB
[
i
].
SetZeroAll
();
}
}
/*
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void
Shuffle
(
const
char
*
srcFile
,
const
char
*
tgtFile
)
{
void
Shuffle
(
const
char
*
srcFile
,
const
char
*
tgtFile
)
{
char
*
line
=
new
char
[
MAX_LINE_LENGTH_HERE
];
#ifndef WIN32
sprintf
(
line
,
"shuf %s > %s"
,
srcFile
,
tgtFile
);
...
...
@@ -379,23 +382,23 @@ void Shuffle(const char * srcFile, const char * tgtFile)
#endif
delete
[]
line
;
}
}
char
lineBuf
[
MAX_LINE_LENGTH_HERE
];
int
wordBuf
[
MAX_LINE_LENGTH_HERE
];
char
lineBuf
[
MAX_LINE_LENGTH_HERE
];
int
wordBuf
[
MAX_LINE_LENGTH_HERE
];
/*
train the model with the standard SGD method
>> train - training data file
>> isShuffled - shuffle the data file or not
>> model - the fnn model
*/
void
Train
(
const
char
*
train
,
bool
isShuffled
,
FNNModel
&
model
)
{
/*
train the model with the standard SGD method
>> train - training data file
>> isShuffled - shuffle the data file or not
>> model - the fnn model
*/
void
Train
(
const
char
*
train
,
bool
isShuffled
,
FNNModel
&
model
)
{
char
name
[
MAX_NAME_LENGTH
];
/* shuffle the data */
if
(
isShuffled
)
{
if
(
isShuffled
)
{
sprintf
(
name
,
"%s-tmp"
,
train
);
Shuffle
(
train
,
name
);
}
...
...
@@ -420,9 +423,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XNet
autoDiffer
;
double
startT
=
GetClockSec
();
/* iterate for a number of epochs */
for
(
epoch
=
0
;
epoch
<
nEpoch
;
epoch
++
)
{
for
(
epoch
=
0
;
epoch
<
nEpoch
;
epoch
++
)
{
/* data file */
FILE
*
file
=
fopen
(
name
,
"rb"
);
...
...
@@ -432,7 +434,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
loss
=
0
;
ngramNum
=
1
;
while
(
ngramNum
>
0
)
{
while
(
ngramNum
>
0
)
{
/* load a minibatch of ngrams */
ngramNum
=
LoadNGrams
(
file
,
model
.
n
,
ngrams
,
sentBatch
,
wordBatch
);
...
...
@@ -453,13 +455,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
XTensor
lossTensor
;
/* make the input tensor for position i */
for
(
int
i
=
0
;
i
<
model
.
n
-
1
;
i
++
)
for
(
int
i
=
0
;
i
<
model
.
n
-
1
;
i
++
)
MakeWordBatch
(
inputs
[
i
],
ngrams
,
ngramNum
,
i
,
model
.
vSize
,
model
.
devID
,
model
.
mem
);
/* make the gold tensor */
MakeWordBatch
(
gold
,
ngrams
,
ngramNum
,
model
.
n
-
1
,
model
.
vSize
,
model
.
devID
,
model
.
mem
);
if
(
!
autoDiff
)
{
if
(
!
autoDiff
)
{
/* prepare an empty network for building the fnn */
FNNNet
net
;
...
...
@@ -469,15 +471,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* forward computation */
Forward
(
inputs
,
output
,
model
,
net
);
/* backward computation to obtain gradients */
Backward
(
inputs
,
output
,
gold
,
CROSSENTROPY
,
model
,
grad
,
net
);
/* update model parameters */
Update
(
model
,
grad
,
learningRate
,
false
);
}
else
{
else
{
/* gradient = 0 */
Clear
(
model
,
true
);
...
...
@@ -489,6 +489,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
lossTensor
=
CrossEntropy
(
output
,
gold
);
output
.
Dump
(
stderr
,
"output:"
,
10
);
gold
.
Dump
(
stderr
,
"gold:"
,
10
);
lossTensor
.
Dump
(
stderr
,
"lossTensor:"
,
10
);
/* automatic differentiation */
autoDiffer
.
Backward
(
lossTensor
);
...
...
@@ -500,14 +503,15 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* get probabilities */
float
prob
=
GetProb
(
output
,
gold
);
prob
=
ReduceSumAll
(
lossTensor
);
loss
+=
prob
;
if
(
autoDiff
)
{
prob
=
-
ReduceSumAll
(
lossTensor
);
}
//printf("prob:%f", prob);
loss
+=
-
prob
;
wordCount
+=
ngramNum
;
wordCountTotal
+=
ngramNum
;
if
(
++
step
>=
nStep
)
{
if
(
++
step
>=
nStep
)
{
isEnd
=
true
;
break
;
}
...
...
@@ -521,7 +525,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
fclose
(
file
);
if
(
isEnd
)
if
(
isEnd
)
break
;
Test
(
testFN
,
outputFN
,
model
);
...
...
@@ -535,17 +539,17 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
elapsed
,
step
,
epoch
);
delete
[]
ngrams
;
}
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void
Update
(
FNNModel
&
model
,
FNNModel
&
grad
,
float
epsilon
,
bool
isNodeGrad
)
{
}
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void
Update
(
FNNModel
&
model
,
FNNModel
&
grad
,
float
epsilon
,
bool
isNodeGrad
)
{
TensorList
paraList
(
10
);
TensorList
gradList
(
10
);
...
...
@@ -559,7 +563,7 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
paraList
.
Add
(
&
model
.
embeddingW
);
if
(
!
isNodeGrad
)
{
if
(
!
isNodeGrad
)
{
gradList
.
Add
(
&
grad
.
outputW
);
gradList
.
Add
(
&
grad
.
outputB
);
...
...
@@ -567,10 +571,10 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
gradList
.
Add
(
&
grad
.
hiddenW
[
i
]);
gradList
.
Add
(
&
grad
.
hiddenB
[
i
]);
}
;
;
gradList
.
Add
(
&
grad
.
embeddingW
);
}
else
{
else
{
gradList
.
Add
(
model
.
outputW
.
grad
);
gradList
.
Add
(
model
.
outputB
.
grad
);
...
...
@@ -592,17 +596,17 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
/* the delta rule */
_Sum
(
para
,
paraGrad
,
para
,
-
epsilon
);
}
}
}
/*
get prediction probabilites of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordPobs - probability of each word
<< return - probability of the batch
*/
float
GetProb
(
XTensor
&
output
,
XTensor
&
gold
,
XTensor
*
wordProbs
)
{
/*
get prediction probabilites of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordPobs - probability of each word
<< return - probability of the batch
*/
float
GetProb
(
XTensor
&
output
,
XTensor
&
gold
,
XTensor
*
wordProbs
)
{
XTensor
probs
;
InitTensorV2
(
&
probs
,
&
output
);
...
...
@@ -613,7 +617,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
XTensor
wprobs
;
InitTensor1DV2
(
&
wprobs
,
output
.
GetDim
(
0
),
output
.
dataType
,
output
.
devID
);
_ReduceSum
(
&
probs
,
&
wprobs
,
1
);
if
(
wordProbs
!=
NULL
)
if
(
wordProbs
!=
NULL
)
_CopyValues
(
&
wprobs
,
wordProbs
);
/* reshape the tensor to fit it into the reduce procedure
...
...
@@ -629,34 +633,34 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
_ReduceSum
(
&
probs
,
&
result
,
1
);
return
result
.
Get1D
(
0
);
}
int
pin
=
0
;
int
wordBufCount
=
0
;
/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int
LoadNGrams
(
FILE
*
file
,
int
n
,
NGram
*
ngrams
,
int
sentNum
,
int
wordNum
)
{
}
int
pin
=
0
;
int
wordBufCount
=
0
;
/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int
LoadNGrams
(
FILE
*
file
,
int
n
,
NGram
*
ngrams
,
int
sentNum
,
int
wordNum
)
{
int
num
=
0
;
int
lineNum
=
0
;
while
(
pin
>
0
||
fgets
(
lineBuf
,
MAX_LINE_LENGTH_HERE
-
1
,
file
))
{
if
(
pin
<=
0
)
{
while
(
pin
>
0
||
fgets
(
lineBuf
,
MAX_LINE_LENGTH_HERE
-
1
,
file
))
{
if
(
pin
<=
0
)
{
int
len
=
(
int
)
strlen
(
lineBuf
);
while
(
lineBuf
[
len
-
1
]
==
'\r'
||
lineBuf
[
len
-
1
]
==
'\n'
)
{
while
(
lineBuf
[
len
-
1
]
==
'\r'
||
lineBuf
[
len
-
1
]
==
'\n'
)
{
lineBuf
[
len
-
1
]
=
0
;
len
--
;
}
len
=
(
int
)
strlen
(
lineBuf
);
if
(
len
==
0
)
if
(
len
==
0
)
continue
;
/* how many characters are in a word */
...
...
@@ -666,9 +670,9 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
int
wNum
=
0
;
int
i
=
0
;
for
(
i
=
pin
;
i
<
len
;
i
++
)
{
for
(
i
=
pin
;
i
<
len
;
i
++
)
{
/* load word (id) seperated by space or tab */
if
((
lineBuf
[
i
]
==
' '
||
lineBuf
[
i
]
==
'\t'
)
&&
wSize
>
0
)
{
if
((
lineBuf
[
i
]
==
' '
||
lineBuf
[
i
]
==
'\t'
)
&&
wSize
>
0
)
{
lineBuf
[
i
]
=
0
;
wordBuf
[
wNum
++
]
=
atoi
(
lineBuf
+
i
-
wSize
);
wSize
=
0
;
...
...
@@ -677,7 +681,7 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
wSize
++
;
}
if
(
wSize
>
0
)
if
(
wSize
>
0
)
wordBuf
[
wNum
++
]
=
atoi
(
lineBuf
+
i
-
wSize
);
wordBufCount
=
wNum
;
...
...
@@ -689,69 +693,69 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
int
i
=
-
MAX_INT
;
/* create ngrams */
for
(
i
=
MAX
(
pin
,
n
-
1
);
i
<
wordBufCount
-
1
;
i
++
)
{
for
(
i
=
MAX
(
pin
,
n
-
1
);
i
<
wordBufCount
-
1
;
i
++
)
{
memcpy
(
ngrams
[
num
++
].
words
,
wordBuf
+
i
-
n
+
1
,
sizeof
(
int
)
*
n
);
if
(
num
>=
wordNum
)
if
(
num
>=
wordNum
)
break
;
}
/* set a finished flag if we reach the end of the sentence*/
if
(
i
>=
wordBufCount
-
1
)
{
if
(
i
>=
wordBufCount
-
1
)
{
pin
=
0
;
wordBufCount
=
0
;
}
/* record where to start next time if we break in the middle */
else
{
else
{
pin
=
i
+
1
;
}
if
((
sentNum
>
0
&&
lineNum
>=
sentNum
)
||
num
>=
wordNum
)
if
((
sentNum
>
0
&&
lineNum
>=
sentNum
)
||
num
>=
wordNum
)
break
;
}
return
num
;
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void
InitZeroOneTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
int
*
rows
,
int
*
cols
,
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void
InitZeroOneTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
int
*
rows
,
int
*
cols
,
int
itemNum
,
int
devID
,
XMem
*
mem
)
{
{
InitTensor2DV2
(
&
tensor
,
rowNum
,
colNum
,
X_FLOAT
,
devID
);
tensor
.
SetZeroAll
();
/* set none-zero cells */
for
(
int
i
=
0
;
i
<
itemNum
;
i
++
)
for
(
int
i
=
0
;
i
<
itemNum
;
i
++
)
tensor
.
Set2D
(
1.0
F
,
rows
[
i
],
cols
[
i
]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicate which word is encode for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void
MakeWordBatch
(
XTensor
&
batch
,
NGram
*
ngrams
,
int
ngramNum
,
int
n
,
int
vSize
,
int
devID
,
XMem
*
mem
)
{
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicate which word is encode for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void
MakeWordBatch
(
XTensor
&
batch
,
NGram
*
ngrams
,
int
ngramNum
,
int
n
,
int
vSize
,
int
devID
,
XMem
*
mem
)
{
int
*
rows
=
new
int
[
ngramNum
];
int
*
cols
=
new
int
[
ngramNum
];
for
(
int
i
=
0
;
i
<
ngramNum
;
i
++
)
{
for
(
int
i
=
0
;
i
<
ngramNum
;
i
++
)
{
rows
[
i
]
=
i
;
cols
[
i
]
=
ngrams
[
i
].
words
[
n
];
}
...
...
@@ -760,31 +764,31 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
delete
[]
rows
;
delete
[]
cols
;
}
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void
Forward
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
,
FNNNet
&
net
)
{
}
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void
Forward
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
,
FNNNet
&
net
)
{
int
batchSize
=
-
1
;
int
n
=
model
.
n
;
int
depth
=
model
.
hDepth
;
TensorList
eList
(
n
-
1
);
/* previoius n - 1 words */
for
(
int
i
=
0
;
i
<
n
-
1
;
i
++
)
{
for
(
int
i
=
0
;
i
<
n
-
1
;
i
++
)
{
XTensor
&
input
=
inputs
[
i
];
XTensor
&
w
=
model
.
embeddingW
;
XTensor
&
embedding
=
net
.
embeddings
[
i
];
if
(
batchSize
==
-
1
)
if
(
batchSize
==
-
1
)
batchSize
=
input
.
dimSize
[
0
];
else
{
else
{
CheckErrors
(
batchSize
==
input
.
dimSize
[
0
],
"Wrong input word representations!"
);
}
...
...
@@ -804,7 +808,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
_Concatenate
(
&
eList
,
&
net
.
embeddingCat
,
1
);
/* go over each hidden layer */
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
{
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
{
XTensor
&
h_pre
=
i
==
0
?
net
.
embeddingCat
:
net
.
hiddens
[
i
-
1
];
XTensor
&
w
=
model
.
hiddenW
[
i
];
XTensor
&
b
=
model
.
hiddenB
[
i
];
...
...
@@ -860,21 +864,21 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* y = softmax(s) */
_LogSoftmax
(
&
s
,
&
y
,
1
);
}
}
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void
Backward
(
XTensor
inputs
[],
XTensor
&
output
,
XTensor
&
gold
,
LOSS_FUNCTION_NAME
loss
,
}
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void
Backward
(
XTensor
inputs
[],
XTensor
&
output
,
XTensor
&
gold
,
LOSS_FUNCTION_NAME
loss
,
FNNModel
&
model
,
FNNModel
&
grad
,
FNNNet
&
net
)
{
{
int
batchSize
=
output
.
GetDim
(
0
);
int
n
=
model
.
n
;
int
depth
=
model
.
hDepth
;
...
...
@@ -979,17 +983,17 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
delete
dedy
;
}
}
}
/*
forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words
>> output - output probability
>> model - the fnn model
*/
void
ForwardAutoDiff
(
NGram
*
ngrams
,
int
batch
,
XTensor
&
output
,
FNNModel
&
model
)
{
/*
forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words
>> output - output probability
>> model - the fnn model
*/
void
ForwardAutoDiff
(
NGram
*
ngrams
,
int
batch
,
XTensor
&
output
,
FNNModel
&
model
)
{
int
n
=
model
.
n
;
int
depth
=
model
.
hDepth
;
...
...
@@ -998,11 +1002,11 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
XTensor
hidden
;
XTensor
b
;
int
size
=
batch
*
(
n
-
1
);
int
size
=
batch
*
(
n
-
1
);
int
*
index
=
new
int
[
size
];
for
(
int
i
=
0
;
i
<
batch
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
-
1
;
j
++
)
{
for
(
int
i
=
0
;
i
<
batch
;
i
++
)
{
for
(
int
j
=
0
;
j
<
n
-
1
;
j
++
)
{
int
a
=
i
*
(
n
-
1
)
+
j
;
index
[
a
]
=
ngrams
[
i
].
words
[
j
];
}
...
...
@@ -1010,7 +1014,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
InitTensor1DV2
(
&
words
,
size
,
X_INT
,
model
.
devID
);
words
.
SetData
(
index
,
size
);
words
.
Dump
(
stderr
,
"word:"
,
10
);
embeddingBig
=
Gather
(
model
.
embeddingW
,
words
);
delete
[]
index
;
...
...
@@ -1018,26 +1022,26 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
int
dimSize
[
2
];
dimSize
[
0
]
=
embeddingBig
.
GetDim
(
0
)
/
(
n
-
1
);
dimSize
[
1
]
=
embeddingBig
.
GetDim
(
1
)
*
(
n
-
1
);
embeddingBig
.
Dump
(
stderr
,
"embeddingBig:"
,
10
);
hidden
=
Reshape
(
embeddingBig
,
embeddingBig
.
order
,
dimSize
);
hidden
.
Dump
(
stderr
,
"hidden-0:"
,
10
);
/* hidden layers */
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
hidden
=
HardTanH
(
MMul
(
hidden
,
model
.
hiddenW
[
i
])
+
model
.
hiddenB
[
i
]);
hidden
.
Dump
(
stderr
,
"hidden-1:"
,
10
);
/* output layer */
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
output
=
Softmax
(
MMul
(
hidden
,
model
.
outputW
)
+
model
.
outputB
,
1
);
}
/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void
ForwardAutoDiff
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
)
{
}
/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void
ForwardAutoDiff
(
XTensor
inputs
[],
XTensor
&
output
,
FNNModel
&
model
)
{
int
n
=
model
.
n
;
int
depth
=
model
.
hDepth
;
...
...
@@ -1047,7 +1051,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
XTensor
b
;
TensorList
inputList
(
n
-
1
);
for
(
int
i
=
0
;
i
<
n
-
1
;
i
++
)
for
(
int
i
=
0
;
i
<
n
-
1
;
i
++
)
inputList
.
Add
(
inputs
+
i
);
/* represent n - 1 words in one tensor */
...
...
@@ -1061,21 +1065,21 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
hidden
=
Merge
(
hidden
,
2
,
0
);
/* hidden layers */
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
hidden
=
MMul
(
hidden
,
model
.
hiddenW
[
i
])
+
model
.
hiddenB
[
i
];
/* output layer */
output
=
LogSoftmax
(
MMul
(
hidden
,
model
.
outputW
)
+
model
.
outputB
,
1
);
}
}
/*
dump the model to the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void
Dump
(
const
char
*
fn
,
FNNModel
&
model
)
{
/*
dump the model to the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void
Dump
(
const
char
*
fn
,
FNNModel
&
model
)
{
FILE
*
file
=
fopen
(
fn
,
"wb"
);
CheckErrors
(
file
,
"Cannot open the model file"
);
...
...
@@ -1094,15 +1098,15 @@ void Dump(const char * fn, FNNModel &model)
fclose
(
file
);
XPRINT
(
0
,
stderr
,
"[INFO] model saved
\n
"
);
}
/*
read the model from the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void
Read
(
const
char
*
fn
,
FNNModel
&
model
)
{
}
/*
read the model from the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void
Read
(
const
char
*
fn
,
FNNModel
&
model
)
{
FILE
*
file
=
fopen
(
fn
,
"rb"
);
CheckErrors
(
file
,
"Cannot open the model file"
);
...
...
@@ -1121,16 +1125,16 @@ void Read(const char * fn, FNNModel &model)
fclose
(
file
);
XPRINT
(
0
,
stderr
,
"[INFO] model loaded
\n
"
);
}
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void
Test
(
const
char
*
test
,
const
char
*
result
,
FNNModel
&
model
)
{
}
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void
Test
(
const
char
*
test
,
const
char
*
result
,
FNNModel
&
model
)
{
int
wordCount
=
0
;
int
sentCount
=
0
;
float
loss
=
0
;
...
...
@@ -1173,14 +1177,13 @@ void Test(const char * test, const char * result, FNNModel &model)
if
(
!
autoDiff
)
{
/* prepare an empty network for building the fnn */
FNNNet
net
;
/* forward computation */
Forward
(
inputs
,
output
,
model
,
net
);
}
else
{
/* this is implemented by gather function */
ForwardAutoDiff
(
ngrams
,
ngramNum
,
output
,
model
);
output
=
Log
(
output
);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
}
...
...
@@ -1213,11 +1216,11 @@ void Test(const char * test, const char * result, FNNModel &model)
double
elapsed
=
GetClockSec
()
-
startT
;
XPRINT1
(
0
,
stderr
,
"[INFO] ppl=%.2f
\n
"
,
exp
(
loss
/
wordCount
));
XPRINT1
(
0
,
stderr
,
"[INFO] ppl=%.2f
\n
"
,
exp
(
loss
/
wordCount
));
XPRINT3
(
0
,
stderr
,
"[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)
\n
"
,
elapsed
,
sentCount
,
wordCount
);
delete
[]
ngrams
;
}
}
};
source/tensor/Main.cpp
查看文件 @
2c4061e9
...
...
@@ -28,7 +28,7 @@
#include <time.h>
#include "XTensor.h"
#include "XDevice.h"
#include "./test/Test.h"
//
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
...
...
@@ -44,7 +44,7 @@ void LittleTest();
void
T2TTest
();
void
T2TTest2
();
void
PowerTest
();
void
Tests
();
int
main
(
int
argc
,
const
char
**
argv
)
{
//PowerTest();
...
...
@@ -63,7 +63,7 @@ int main( int argc, const char ** argv )
//return 0;
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-test"
))
Test
();
Test
s
();
else
{
fprintf
(
stderr
,
"Thanks for using NiuTrans.Tensor! This is a library that eases the
\n
"
);
fprintf
(
stderr
,
"use of tensors. All you need is to ...
\n\n
"
);
...
...
@@ -75,219 +75,223 @@ int main( int argc, const char ** argv )
return
0
;
}
/*
read a tensor from a binary file
>> tensor - the tensor to load into (via XTensor::Read)
>> filename - path of the binary file
>> label - label string forwarded to XTensor::Read
*/
void myRead(XTensor * tensor, const char * filename, const char * label)
{
    FILE * file = fopen(filename, "rb");

    /* the original only printed the name and then dereferenced the NULL
       handle; bail out instead */
    if (file == NULL) {
        printf("%s\n", filename);
        return;
    }

    tensor->Read(file, label);

    /* the handle was leaked before */
    fclose(file);
}
/*
dump a tensor to a binary file
>> tensor - the tensor to write out (via XTensor::Dump)
>> filename - path of the binary file to create
>> label - label string forwarded to XTensor::Dump
*/
void myDump(XTensor * tensor, const char * filename, const char * label)
{
    FILE * file = fopen(filename, "wb");

    /* the original only printed the name and then dereferenced the NULL
       handle; bail out instead */
    if (file == NULL) {
        printf("%s\n", filename);
        return;
    }

    tensor->Dump(file, label);

    /* the handle was leaked before */
    fclose(file);
}
void
PowerTest
()
{
XTensor
input
;
XTensor
output
;
InitTensor2D
(
&
input
,
256
,
10000
,
X_FLOAT
,
0
);
InitTensor2D
(
&
output
,
256
,
10000
,
X_FLOAT
,
0
);
myRead
(
&
input
,
"1.txt"
,
""
);
_Power
(
&
input
,
&
output
,
2
);
output
.
Dump
(
stderr
,
""
,
200
);
}
void
SmallTest
()
{
XTensor
a
;
XTensor
b
;
XTensor
c
;
XTensor
d
;
InitTensor2D
(
&
a
,
2
,
2
);
InitTensor2D
(
&
b
,
2
,
2
);
a
.
SetZeroAll
();
b
.
SetZeroAll
();
a
.
Set2D
(
1.0
F
,
0
,
0
);
a
.
Set2D
(
2.0
F
,
1
,
1
);
b
=
Sum
(
a
,
Multiply
(
a
,
a
));
/* this is prohibited !!!!!!!!!!!!! */
//XTensor c = a * b + a;
//XTensor d = a + b + c.Lin(0.5F);
c
=
a
*
b
+
a
;
d
=
a
+
b
+
c
.
Lin
(
0.5
F
);
XLink
::
CheckNetwork
(
&
d
);
//XLink::ShowNetwork(stderr, &d);
a
.
Dump
(
stderr
,
"a:"
);
b
.
Dump
(
stderr
,
"b:"
);
c
.
Dump
(
stderr
,
"c:"
);
d
.
Dump
(
stderr
,
"d:"
);
}
/*
check the transpose op: build a 2 x 3 x 4 x 5 tensor filled with
0..unitNum-1, swap dimensions I and J, and dump the result
*/
void TransposeTest()
{
    XTensor a;
    XTensor b;

    /* the two dimensions to exchange */
    int I = 2;
    int J = 3;

    InitTensor4D(&a, 2, 3, 4, 5);

    /* shape of the transposed tensor: a's shape with dims I and J swapped */
    int * dims = new int[a.order];
    memcpy(dims, a.dimSize, sizeof(int) * a.order);
    dims[I] = a.dimSize[J];
    dims[J] = a.dimSize[I];

    InitTensor(&b, 4, dims);

    a.SetZeroAll();
    b.SetZeroAll();

    /* fill a with a running index so transposed positions are recognizable */
    float * data = new float[a.unitNum];
    for (int i = 0; i < a.unitNum; i++)
        data[i] = (float)i;

    a.SetData(data, a.unitNum, 0);

    _Transpose(&a, &b, I, J);
    b.Dump(stderr, "b:");

    delete[] data;
    /* dims was leaked in the original version */
    delete[] dims;
}
/* tiny arithmetic smoke test: print 5000 * 100000
   NOTE(review): the diff view fused this function with Tests() below and
   dropped its closing brace; reconstructed as two separate definitions */
void LittleTest()
{
    int a = 5000;
    int b = 100000;
    int c = a * b;
    printf("%d\n", c);
}

/* placeholder test entry used by main's "-test" option; the real test
   suite is disabled in this build, so just abort */
void Tests()
{
    exit(1);
}
void
T2TTest
()
{
XTensor
*
input
;
XTensor
*
weight
;
XTensor
*
output
;
XTensor
*
gold
;
XTensor
*
dedy
;
XTensor
*
dedx
;
XTensor
*
dedxTmp
;
XTensor
*
dedw
;
XTensor
*
padding
;
DTYPE
loss
;
int
*
dimSize
=
new
int
[
2
];
dimSize
[
0
]
=
256
;
dimSize
[
1
]
=
10001
;
int
*
dimSize2
=
new
int
[
3
];
dimSize2
[
0
]
=
2
;
dimSize2
[
1
]
=
31
;
dimSize2
[
2
]
=
256
;
int
*
dimSize3
=
new
int
[
3
];
dimSize3
[
0
]
=
2
;
dimSize3
[
1
]
=
31
;
dimSize3
[
2
]
=
10001
;
int
*
dimSize4
=
new
int
[
2
];
dimSize4
[
0
]
=
2
;
dimSize4
[
1
]
=
31
;
input
=
NewTensor
(
3
,
dimSize2
,
X_FLOAT
,
1.0
F
,
0
);
weight
=
NewTensor
(
2
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
dedw
=
NewTensor
(
2
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
gold
=
NewTensor
(
3
,
dimSize3
,
X_FLOAT
,
1.0
F
,
0
);
output
=
NewTensor
(
3
,
dimSize3
,
X_FLOAT
,
1.0
F
,
0
);
dedy
=
NewTensor
(
3
,
dimSize3
,
X_FLOAT
,
1.0
F
,
0
);
dedx
=
NewTensor
(
3
,
dimSize3
,
X_FLOAT
,
1.0
F
,
0
);
dedxTmp
=
NewTensor
(
3
,
dimSize3
,
X_FLOAT
,
1.0
F
,
0
);
padding
=
NewTensor
(
2
,
dimSize4
,
X_FLOAT
,
1.0
F
,
0
);
//weight = NewTensor(2, dimSize);
//dedw = NewTensor(2, dimSize);
//input = NewTensor(3, dimSize2);
//gold = NewTensor(3, dimSize3);
//output = NewTensor(3, dimSize3);
//dedy = NewTensor(3, dimSize3);
//dedx = NewTensor(3, dimSize3);
//dedxTmp = NewTensor(3, dimSize3);
//padding = NewTensor(2, dimSize4);
myRead
(
input
,
"x.txt"
,
"x"
);
myRead
(
weight
,
"w.txt"
,
"w"
);
myRead
(
gold
,
"gold.txt"
,
"gold"
);
myRead
(
padding
,
"padding.txt"
,
"padding"
);
XTensor
inter
;
inter
=
MMul
(
*
input
,
*
weight
);
_Softmax
(
&
inter
,
output
,
2
);
//_LogMe(output);
loss
=
_CrossEntropyFast
(
output
,
gold
,
REDUCE_MEAN
,
NULL
,
padding
);
printf
(
"loss: %f
\n
"
,
loss
);
_CrossEntropyBackward
(
dedy
,
output
,
gold
,
NULL
);
//_CrossEntropyBackward(dedy, output, gold, NULL, padding);
myDump
(
dedy
,
"dedy.txt"
,
"dedy"
);
_SoftmaxBackward
(
NULL
,
output
,
input
,
dedy
,
dedx
,
NULL
,
-
1
,
NOLOSS
);
_Sub
(
output
,
gold
,
dedxTmp
);
myDump
(
dedx
,
"dedx.txt"
,
"dedx"
);
dedx
->
Dump
(
stderr
,
"dedx"
,
200
);
dedxTmp
->
Dump
(
stderr
,
"dedxTmp"
,
200
);
input
->
Reshape
(
input
->
unitNum
/
input
->
GetDim
(
-
1
),
input
->
GetDim
(
-
1
));
dedx
->
Reshape
(
dedx
->
unitNum
/
dedx
->
GetDim
(
-
1
),
dedx
->
GetDim
(
-
1
));
_MatrixMulBatched
(
input
,
X_TRANS
,
dedx
,
X_NOTRANS
,
dedw
);
myDump
(
dedw
,
"dedw.txt"
,
"dedw"
);
}
void
T2TTest2
()
{
int
dimSize
[
3
];
dimSize
[
0
]
=
161
;
dimSize
[
1
]
=
47
;
dimSize
[
2
]
=
10001
;
XTensor
*
probs
=
NewTensor
(
3
,
dimSize
,
X_FLOAT
,
1.0
F
,
0
);
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat
(
probs
,
1.0
F
);
probs
->
Reshape
(
1
,
probs
->
unitNum
);
DTYPE
sum
=
_ReduceSumAll
(
probs
);
printf
(
"%e
\n
"
,
sum
);
//XTensor tmp;
//tmp = IsNonZero(*probs);
//DTYPE nonZeroNum = ReduceSumAll(tmp);
//printf("%f\n", nonZeroNum);
//
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//printf("%e\n", gpu);
}
//void myRead(XTensor * tensor, const char * filename, const char * label)
//{
// FILE * file = fopen(filename, "rb");
// if(file == NULL)
// printf("%s\n", filename);
// tensor->Read(file, label);
//}
//
//void myDump(XTensor * tensor, const char * filename, const char * label)
//{
// FILE * file = fopen(filename, "wb");
// if(file == NULL)
// printf("%s\n", filename);
// tensor->Dump(file, label);
//}
//
//void PowerTest()
//{
// XTensor input;
// XTensor output;
// InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
// InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
// myRead(&input, "1.txt", "");
//
// _Power(&input, &output, 2);
// output.Dump(stderr, "", 200);
//}
//
//void SmallTest()
//{
// XTensor a;
// XTensor b;
// XTensor c;
// XTensor d;
//
// InitTensor2D(&a, 2, 2);
// InitTensor2D(&b, 2, 2);
// a.SetZeroAll();
// b.SetZeroAll();
// a.Set2D(1.0F, 0, 0);
// a.Set2D(2.0F, 1, 1);
//
// b = Sum(a, Multiply(a, a));
//
// /* this is prohibited !!!!!!!!!!!!! */
// //XTensor c = a * b + a;
// //XTensor d = a + b + c.Lin(0.5F);
//
// c = a * b + a;
// d = a + b + c.Lin(0.5F);
//
// XLink::CheckNetwork(&d);
// //XLink::ShowNetwork(stderr, &d);
//
// a.Dump(stderr, "a:");
// b.Dump(stderr, "b:");
// c.Dump(stderr, "c:");
// d.Dump(stderr, "d:");
//}
//
//void TransposeTest()
//{
// XTensor a;
// XTensor b;
//
// int I = 2;
// int J = 3;
//
// InitTensor4D(&a, 2, 3, 4, 5);
//
// int * dims = new int[a.order];
// memcpy(dims, a.dimSize, sizeof(int) * a.order);
// dims[I] = a.dimSize[J];
// dims[J] = a.dimSize[I];
//
// InitTensor(&b, 4, dims);
//
// a.SetZeroAll();
// b.SetZeroAll();
//
// float * data = new float[a.unitNum];
// for(int i = 0; i < a.unitNum; i++)
// data[i] = (float)i;
//
// a.SetData(data, a.unitNum, 0);
//
// _Transpose(&a, &b, I, J);
// b.Dump(stderr, "b:");
//
// delete[] data;
//}
//
//void LittleTest()
//{
// int a = 5000;
// int b = 100000;
// int c = a*b;
// printf("%d\n", c);
//
// exit(1);
//}
//
//void T2TTest()
//{
// XTensor * input;
// XTensor * weight;
// XTensor * output;
// XTensor * gold;
// XTensor * dedy;
// XTensor * dedx;
// XTensor * dedxTmp;
// XTensor * dedw;
// XTensor * padding;
//
// DTYPE loss;
//
// int * dimSize = new int[2];
// dimSize[0] = 256;
// dimSize[1] = 10001;
//
// int * dimSize2 = new int[3];
// dimSize2[0] = 2;
// dimSize2[1] = 31;
// dimSize2[2] = 256;
//
// int * dimSize3 = new int[3];
// dimSize3[0] = 2;
// dimSize3[1] = 31;
// dimSize3[2] = 10001;
//
// int * dimSize4 = new int[2];
// dimSize4[0] = 2;
// dimSize4[1] = 31;
//
// input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
// weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//
// //weight = NewTensor(2, dimSize);
// //dedw = NewTensor(2, dimSize);
// //input = NewTensor(3, dimSize2);
// //gold = NewTensor(3, dimSize3);
// //output = NewTensor(3, dimSize3);
// //dedy = NewTensor(3, dimSize3);
// //dedx = NewTensor(3, dimSize3);
// //dedxTmp = NewTensor(3, dimSize3);
// //padding = NewTensor(2, dimSize4);
//
// myRead(input, "x.txt", "x");
// myRead(weight, "w.txt", "w");
// myRead(gold, "gold.txt", "gold");
// myRead(padding, "padding.txt", "padding");
//
// XTensor inter;
// inter = MMul(*input, *weight);
//
// _Softmax(&inter, output, 2);
//
// //_LogMe(output);
// loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
//
// printf("loss: %f\n", loss);
//
// _CrossEntropyBackward(dedy, output, gold, NULL);
// //_CrossEntropyBackward(dedy, output, gold, NULL, padding);
//
// myDump(dedy, "dedy.txt", "dedy");
//
// _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
// _Sub(output, gold, dedxTmp);
//
// myDump(dedx, "dedx.txt", "dedx");
// dedx->Dump(stderr, "dedx", 200);
// dedxTmp->Dump(stderr, "dedxTmp", 200);
//
// input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
// dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
//
// _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
//
// myDump(dedw, "dedw.txt", "dedw");
//}
//
//void T2TTest2()
//{
// int dimSize[3];
// dimSize[0] = 161;
// dimSize[1] = 47;
// dimSize[2] = 10001;
// XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
// //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//
// //myRead(probs, "probs.txt", " ");
// _SetDataFixedFloat(probs, 1.0F);
//
// probs->Reshape(1, probs->unitNum);
//
// DTYPE sum = _ReduceSumAll(probs);
// printf("%e\n", sum);
//
// //XTensor tmp;
// //tmp = IsNonZero(*probs);
// //DTYPE nonZeroNum = ReduceSumAll(tmp);
// //printf("%f\n", nonZeroNum);
// //
// //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//
// //printf("%e\n", gpu);
//}
source/tensor/loss/CrossEntropy.cu
查看文件 @
2c4061e9
...
...
@@ -196,17 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims;
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
int num = dedy->unitNum / dedy->GetDim(n);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
}
//
if(padding != NULL) {
//
XTensor * tmp = NewTensor(padding);
//
_IsNonZero(padding, tmp);
//
int nonZeroNum = (int)_ReduceSumAll(tmp);
//
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
//
delete tmp;
//
}
//
else {
//
int num = dedy->unitNum / dedy->GetDim(n);
//
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
//
}
}
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论