Tensor.LowPrecision - Commit 2c4061e9
authored Jul 30, 2019 by ltb
parent 3800528b

    fixed FNNLM of branch of xiao

Showing 4 changed files with 1412 additions and 1406 deletions:

    source/network/Main.cpp               +0     -1
    source/sample/fnnlm/FNNLM.cpp         +1182  -1179
    source/tensor/Main.cpp                +219   -215
    source/tensor/loss/CrossEntropy.cu    +11    -11
source/network/Main.cpp

@@ -24,7 +24,6 @@
...
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
...
source/sample/fnnlm/FNNLM.cpp

/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 ...
 */

@@ -15,15 +15,15 @@
 * limitations under the License.
 */

/*
 * This is a simple implementation of the feed-forward network-based language
 * model (FNNLM). See more details about FNNLM in
 * "A Neural Probabilistic Language Model" by Bengio et al.
 * Journal of Machine Learning Research 3 (2003) 1137-1155
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
 */

#include <math.h>
#include "FNNLM.h"
...
@@ -32,6 +32,7 @@
#include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h"
#include "../../tensor/core/math/ScaleAndShift.h"

namespace fnnlm
{
@@ -39,1185 +40,1187 @@ namespace fnnlm

#define MAX_NAME_LENGTH 1024
#define MAX_LINE_LENGTH_HERE 1024 * 32
char trainFN[MAX_NAME_LENGTH] = "";    // file name of the training data
char modelFN[MAX_NAME_LENGTH] = "";    // file name of the FNN model
char testFN[MAX_NAME_LENGTH] = "";     // file name of the test data
char outputFN[MAX_NAME_LENGTH] = "";   // file name of the result data

float learningRate = 0.01F;    // learning rate
int nStep = 10000000;          // max learning steps (or model updates)
int nEpoch = 10;               // max training epochs
float minmax = 0.08F;          // range [-p,p] for parameter initialization
int sentBatch = 0;             // batch size at the sentence level
int wordBatch = 1;             // batch size at the word level
bool shuffled = false;         // shuffled the training data file or not
bool autoDiff = false;         // indicator of automatic differentiation

void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model);
void Check(FNNModel &model);
void Copy(FNNModel &tgt, FNNModel &src);
void Clear(FNNModel &model, bool isNodeGrad);
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
void Train(const char * train, bool isShuffled, FNNModel &model);
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
void Dump(const char * fn, FNNModel &model);
void Read(const char * fn, FNNModel &model);
void Test(const char * test, const char * result, FNNModel &model);
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum);
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum,
                         int * rows, int * cols, int itemNum, int devID, XMem * mem);
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize,
                   int devID, XMem * mem);
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
>> argc - number of the arguments
>> argv - pointers to the arguments
<< return - error code

arguments:
 -train S: specify training data file name
 -model S: specify model file name
 -test S: specify test data file name
 -output S: specify result data file name
 -n D: order of the language model
 -eSize D: embedding size
 -vSize D: vocabulary size
 -hdepth D: number of stacked hidden layers
 -hsize D: size of each hidden layer
 -lrate F: learning rate
 -nstep D: maximum number of model updates
 -nepoch D: maximum number of training epochs
 -batch D: batch size (how many sentences)
 -wbatch D: batch size at the word level (how many words)
 -shuffle: shuffle the training data
 -devid D: the id of the device used (-1: CPU, >=0: GPUs)
 -mempool: use memory pools for memory management
 -autodiff: use automatic differentiation for training

where S=string, D=integer and F=float.
All words in the training and test data files are encoded as their indices
in the vocabulary. E.g.,
0 29 2 11 1
might be a line of the file.
*/
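As a quick orientation for readers of the sample, here is a minimal sketch of driving this entry point programmatically with an argument vector equivalent to the command line described above. The wrapper name, file names and sizes are illustrative assumptions, not part of this commit; the flags are the ones LoadArgs parses below.

/* hypothetical helper, for illustration only */
void RunFNNLMSampleSketch()
{
    const char * argv[] = { "fnnlm",
                            "-train",  "train.id",
                            "-model",  "fnn.model",
                            "-test",   "test.id",
                            "-output", "result.txt",
                            "-n",      "3",
                            "-esize",  "100",
                            "-vsize",  "10000",
                            "-hdepth", "1",
                            "-hsize",  "256",
                            "-autodiff" };

    /* argc is the element count of argv */
    fnnlm::FNNLMMain((int)(sizeof(argv) / sizeof(argv[0])), argv);
}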
int FNNLMMain(int argc, const char ** argv)
{
    if(argc == 0)
        return 1;

    FNNModel model;

    /* load arguments */
    LoadArgs(argc, argv, model);

    /* check the setting */
    Check(model);

    /* initialize model parameters */
    Init(model);

    /* learn model parameters */
    if(strcmp(trainFN, ""))
        Train(trainFN, shuffled, model);

    /* save the final model */
    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
        Dump(modelFN, model);

    /* load the model if neccessary */
    if(strcmp(modelFN, ""))
        Read(modelFN, model);

    /* test the model on the new data */
    if(strcmp(testFN, "") && strcmp(outputFN, ""))
        Test(testFN, outputFN, model);

    return 0;
}
/*
load arguments
>> argc - number of the arguments
>> argv - pointers to the arguments
>> model - the fnn model
*/
void LoadArgs(int argc, const char ** argv, FNNModel &model)
{
    fprintf(stderr, "args: \n");
    for(int i = 0; i < argc; i++)
    {
        if(!strcmp(argv[i], "-train") && i + 1 < argc)
        {
            strcpy(trainFN, argv[i + 1]);
            fprintf(stderr, " -train=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-model") && i + 1 < argc)
        {
            strcpy(modelFN, argv[i + 1]);
            fprintf(stderr, " -model=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-test") && i + 1 < argc)
        {
            strcpy(testFN, argv[i + 1]);
            fprintf(stderr, " -test=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-output") && i + 1 < argc)
        {
            strcpy(outputFN, argv[i + 1]);
            fprintf(stderr, " -output=%s\n", argv[i + 1]);
        }
        if(!strcmp(argv[i], "-n") && i + 1 < argc)
        {
            model.n = atoi(argv[i + 1]);
            fprintf(stderr, " -n=%d\n", model.n);
        }
        if(!strcmp(argv[i], "-esize") && i + 1 < argc)
        {
            model.eSize = atoi(argv[i + 1]);
            fprintf(stderr, " -esize=%d\n", model.eSize);
        }
        if(!strcmp(argv[i], "-vsize") && i + 1 < argc)
        {
            model.vSize = atoi(argv[i + 1]);
            fprintf(stderr, " -vsize=%d\n", model.vSize);
        }
        if(!strcmp(argv[i], "-hdepth") && i + 1 < argc)
        {
            model.hDepth = atoi(argv[i + 1]);
            fprintf(stderr, " -hdepth=%d\n", model.hDepth);
        }
        if(!strcmp(argv[i], "-hsize") && i + 1 < argc)
        {
            model.hSize = atoi(argv[i + 1]);
            fprintf(stderr, " -hsize=%d\n", model.hSize);
        }
        if(!strcmp(argv[i], "-lrate") && i + 1 < argc)
        {
            learningRate = (float)atof(argv[i + 1]);
            fprintf(stderr, " -lrate=%f\n", learningRate);
        }
        if(!strcmp(argv[i], "-nstep") && i + 1 < argc)
        {
            nStep = atoi(argv[i + 1]);
            fprintf(stderr, " -nstep=%d\n", nStep);
        }
        if(!strcmp(argv[i], "-nepoch") && i + 1 < argc)
        {
            nEpoch = atoi(argv[i + 1]);
            fprintf(stderr, " -nepoch=%d\n", nEpoch);
        }
        if(!strcmp(argv[i], "-minmax") && i + 1 < argc)
        {
            minmax = (float)fabs(atof(argv[i + 1]));
            fprintf(stderr, " -minmax=%f\n", minmax);
        }
        if(!strcmp(argv[i], "-batch") && i + 1 < argc)
        {
            sentBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -batch=%d\n", sentBatch);
        }
        if(!strcmp(argv[i], "-wbatch") && i + 1 < argc)
        {
            wordBatch = atoi(argv[i + 1]);
            fprintf(stderr, " -wbatch=%d\n", wordBatch);
        }
        if(!strcmp(argv[i], "-shuffle"))
        {
            shuffled = true;
            fprintf(stderr, " -shuffle=true\n");
        }
        if(!strcmp(argv[i], "-autodiff"))
        {
            autoDiff = true;
            fprintf(stderr, " -autodiff=true\n");
        }
        if(!strcmp(argv[i], "-dev") && i + 1 < argc)
        {
            model.devID = atoi(argv[i + 1]);
            fprintf(stderr, " -dev=%d\n", model.devID);
        }
    }

    for(int i = 0; i < argc; i++)
    {
        if(!strcmp(argv[i], "-mempool"))
            model.mem = new XMem(model.devID);
    }
}
/* check model settings */
void Check(FNNModel &model)
{
    CheckErrors(model.n > 0 && model.n <= MAX_N_GRAM, "The LM order is out of range (use -n)!");
    CheckErrors(model.vSize > 0, "no vocabulary size found (use -vsize)!");
    CheckErrors(model.eSize > 0, "no embedding size found (use -esize)!");
}
/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
    for(int i = 0; i < MAX_HIDDEN_NUM; i++)
    {
        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
    }
    InitTensorV2(&tgt.outputW, &src.outputW);
    InitTensorV2(&tgt.outputB, &src.outputB);

    tgt.n = src.n;
    tgt.eSize = src.eSize;
    tgt.hDepth = src.hDepth;
    tgt.hSize = src.hSize;
    tgt.vSize = src.vSize;
    tgt.devID = src.devID;
    tgt.useMemPool = src.useMemPool;

    if(src.mem != NULL)
    {
        tgt.mem = new XMem(src.mem->devID, src.mem->mode,
                           src.mem->maxBlockSize, src.mem->blockNum,
                           src.mem->bufSize);
    }
}
/*
reset model parameters
>> model - the model whose parameter (gradient) is set to 0
>> isNodeGrad - indicates whether the tensor node keeps the gradient information
*/
void Clear(FNNModel &model, bool isNodeGrad)
{
    if(isNodeGrad)
    {
        if(model.embeddingW.grad != NULL)
            model.embeddingW.grad->SetZeroAll();
        for(int i = 0; i < MAX_HIDDEN_NUM; i++)
        {
            if(model.hiddenW[i].grad != NULL)
                model.hiddenW[i].grad->SetZeroAll();
            if(model.hiddenB[i].grad != NULL)
                model.hiddenB[i].grad->SetZeroAll();
        }
        if(model.outputW.grad != NULL)
            model.outputW.grad->SetZeroAll();
        if(model.outputB.grad != NULL)
            model.outputB.grad->SetZeroAll();
    }
    else
    {
        model.embeddingW.SetZeroAll();
        for(int i = 0; i < MAX_HIDDEN_NUM; i++)
        {
            model.hiddenW[i].SetZeroAll();
            model.hiddenB[i].SetZeroAll();
        }
        model.outputW.SetZeroAll();
        model.outputB.SetZeroAll();
    }
}
/*
initialize a 1d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> num - number of items
>> model - the fnn model
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}

/*
initialize a 2d tensor using the fnn model setting
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> model - the fnn model
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
/* initialize the model */
void Init(FNNModel &model)
{
    /* create embedding parameter matrix: vSize * eSize */
    InitModelTensor2D(model.embeddingW, model.vSize, model.eSize, model);
    model.embeddingW.SetVarFlag();

    /* create hidden layer parameter matrics */
    for(int i = 0; i < model.hDepth; i++)
    {
        /* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
           hsize * hsize otherwise */
        if(i == 0)
            InitModelTensor2D(model.hiddenW[i], (model.n - 1) * model.eSize, model.hSize, model);
        else
            InitModelTensor2D(model.hiddenW[i], model.hSize, model.hSize, model);
        model.hiddenW[i].SetVarFlag();

        /* bias term: a row vector of hSize entries */
        InitModelTensor1D(model.hiddenB[i], model.hSize, model);
        model.hiddenB[i].SetVarFlag();
    }

    /* create the output layer parameter matrix and bias term */
    int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
    InitModelTensor2D(model.outputW, iSize, model.vSize, model);
    InitModelTensor1D(model.outputB, model.vSize, model);
    model.outputW.SetVarFlag();
    model.outputB.SetVarFlag();

    /* then, we initialize model parameters using a uniform distribution in range
       of [-minmax, minmax] */
    model.embeddingW.SetDataRand(-minmax, minmax);
    model.outputW.SetDataRand(-minmax, minmax);
    for(int i = 0; i < model.hDepth; i++)
        model.hiddenW[i].SetDataRand(-minmax, minmax);

    /* all bias terms are set to zero */
    model.outputB.SetZeroAll();
    for(int i = 0; i < model.hDepth; i++)
        model.hiddenB[i].SetZeroAll();
}
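Spelled out, the initialization above draws every weight entry independently from a uniform distribution over the [-minmax, minmax] range set on the command line, while every bias vector starts at zero:

    $W^{emb}_{ij},\; W^{hid}_{ij},\; W^{out}_{ij} \sim \mathcal{U}(-p,\, p), \qquad p = \text{minmax} \;(0.08\text{F by default}), \qquad b^{hid} = b^{out} = 0$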
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void Shuffle(const char * srcFile, const char * tgtFile)
{
    char * line = new char[MAX_LINE_LENGTH_HERE];
#ifndef WIN32
    sprintf(line, "shuf %s > %s", srcFile, tgtFile);
    system(line);
#else
    ShowErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
    delete[] line;
}
char lineBuf[MAX_LINE_LENGTH_HERE];
int wordBuf[MAX_LINE_LENGTH_HERE];

/*
train the model with the standard SGD method
>> train - training data file
>> isShuffled - shuffle the data file or not
>> model - the fnn model
*/
void Train(const char * train, bool isShuffled, FNNModel &model)
{
    char name[MAX_NAME_LENGTH];

    /* shuffle the data */
    if(isShuffled)
    {
        sprintf(name, "%s-tmp", train);
        Shuffle(train, name);
    }
    else
        strcpy(name, train);

    int epoch = 0;
    int step = 0;
    int wordCount = 0;
    int wordCountTotal = 0;
    int ngramNum = 1;
    float loss = 0;
    bool isEnd = false;

    NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];

    /* make a model to keep gradients */
    FNNModel grad;
    Copy(grad, model);

    /* XNet for automatic differentiation */
    XNet autoDiffer;

    double startT = GetClockSec();

    /* iterate for a number of epochs */
    for(epoch = 0; epoch < nEpoch; epoch++)
    {
        /* data file */
        FILE * file = fopen(name, "rb");
        CheckErrors(file, "Cannot open the training file");

        wordCount = 0;
        loss = 0;
        ngramNum = 1;

        while(ngramNum > 0)
        {
            /* load a minibatch of ngrams */
            ngramNum = LoadNGrams(file, model.n, ngrams, sentBatch, wordBatch);

            if(ngramNum <= 0)
                break;

            /* previous n - 1 words */
            XTensor inputs[MAX_N_GRAM];

            /* the predicted word */
            XTensor output;

            /* the gold standard */
            XTensor gold;

            /* the loss tensor */
            XTensor lossTensor;

            /* make the input tensor for position i */
            for(int i = 0; i < model.n - 1; i++)
                MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);

            /* make the gold tensor */
            MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

            if(!autoDiff)
            {
                /* prepare an empty network for building the fnn */
                FNNNet net;

                /* gradident = 0 */
                Clear(grad, false);

                /* forward computation */
                Forward(inputs, output, model, net);

                /* backward computation to obtain gradients */
                Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);

                /* update model parameters */
                Update(model, grad, learningRate, false);
            }
            else
            {
                /* gradient = 0 */
                Clear(model, true);

                /* forward + backward process */
                /* this is implemented by gather function */
                ForwardAutoDiff(ngrams, ngramNum, output, model);

                /* this is implemented by multiply function */
                //ForwardAutoDiff(inputs, output, model);

                lossTensor = CrossEntropy(output, gold);

                /* automatic differentiation */
                autoDiffer.Backward(lossTensor);
                //autoDiffer.Backward(output, gold, CROSSENTROPY);

                /* update model parameters */
                Update(model, grad, learningRate, true);
            }

            /* get probabilities */
            float prob = GetProb(output, gold);
            prob = ReduceSumAll(lossTensor);

            loss += prob;
            wordCount += ngramNum;
            wordCountTotal += ngramNum;

            if(++step >= nStep)
            {
                isEnd = true;
                break;
            }

            if(step % 100 == 0)
            {
                double elapsed = GetClockSec() - startT;
                XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
                        elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
            }
        }

        fclose(file);

        if(isEnd)
            break;

        Test(testFN, outputFN, model);
    }

    double elapsed = GetClockSec() - startT;

    XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
            elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
            elapsed, step, epoch);

    delete[] ngrams;
}
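The progress lines printed every 100 steps report perplexity from the accumulated cross-entropy; written out, exp(loss / wordCount) is

    $\mathrm{PPL} = \exp\!\Big(\tfrac{1}{N}\sum_{k=1}^{N} \mathrm{CE}(y_k, \hat{y}_k)\Big)$

where N is wordCount (the number of n-grams seen so far in the epoch) and the sum is what loss accumulates batch by batch.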
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
    TensorList paraList(10);
    TensorList gradList(10);

    paraList.Add(&model.outputW);
    paraList.Add(&model.outputB);
    for(int i = 0; i < model.hDepth; i++)
    {
        paraList.Add(&model.hiddenW[i]);
        paraList.Add(&model.hiddenB[i]);
    }
    paraList.Add(&model.embeddingW);

    if(!isNodeGrad)
    {
        gradList.Add(&grad.outputW);
        gradList.Add(&grad.outputB);
        for(int i = 0; i < model.hDepth; i++)
        {
            gradList.Add(&grad.hiddenW[i]);
            gradList.Add(&grad.hiddenB[i]);
        }
        gradList.Add(&grad.embeddingW);
    }
    else
    {
        gradList.Add(model.outputW.grad);
        gradList.Add(model.outputB.grad);
        for(int i = 0; i < model.hDepth; i++)
        {
            gradList.Add(model.hiddenW[i].grad);
            gradList.Add(model.hiddenB[i].grad);
        }
        gradList.Add(model.embeddingW.grad);
    }

    for(int i = 0; i < paraList.count; i++)
    {
        XTensor * para = (XTensor*)paraList.GetItem(i);
        XTensor * paraGrad = (XTensor*)gradList.GetItem(i);

        //fprintf(stderr, "%d\n", i);
        //paraGrad->Dump(stderr, "grad:", 10);

        /* the delta rule */
        _Sum(para, paraGrad, para, -epsilon);
    }
}
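The _Sum(para, paraGrad, para, -epsilon) call is the "delta rule" named in the comment: an in-place plain-SGD step on each parameter tensor,

    $\theta \leftarrow \theta - \epsilon \, \nabla_{\theta} E$

with epsilon being the learning rate passed in (learningRate at the call sites in Train).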
/*
get prediction probabilites of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordPobs - probability of each word
<< return - probability of the batch
*/
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
    XTensor probs;
    InitTensorV2(&probs, &output);

    /* probs[i,j] = output[i,j] * gold[i,j] */
    _Multiply(&output, &gold, &probs);

    /* probability of each word */
    XTensor wprobs;
    InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
    _ReduceSum(&probs, &wprobs, 1);
    if(wordProbs != NULL)
        _CopyValues(&wprobs, wordProbs);

    /* reshape the tensor to fit it into the reduce procedure
       TODO: XTensor supports scalars */
    int dims[2];
    dims[0] = 1;
    dims[1] = probs.unitNum;
    probs.Reshape(2, dims);

    /* probability for the batch */
    XTensor result;
    InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
    _ReduceSum(&probs, &result, 1);

    return result.Get1D(0);
}
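When output comes from the _LogSoftmax call in Forward it holds log-probabilities, and gold is the zero-one batch built by MakeWordBatch, so the value returned here amounts to the summed log-probability of the gold words in the batch:

    $\mathrm{GetProb}(y, g) = \sum_{i,j} g_{ij}\, y_{ij} = \sum_{i} \log P(w_i \mid \mathrm{context}_i)$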
int pin = 0;
int wordBufCount = 0;

/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
{
    int num = 0;
    int lineNum = 0;
    while(pin > 0 || fgets(lineBuf, MAX_LINE_LENGTH_HERE - 1, file))
    {
        if(pin <= 0)
        {
            int len = (int)strlen(lineBuf);

            while(lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n')
            {
                lineBuf[len - 1] = 0;
                len--;
            }

            len = (int)strlen(lineBuf);
            if(len == 0)
                continue;

            /* how many characters are in a word */
            int wSize = 0;

            /* how many words are in the sentence */
            int wNum = 0;
            int i = 0;

            for(i = pin; i < len; i++)
            {
                /* load word (id) seperated by space or tab */
                if((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0)
                {
                    lineBuf[i] = 0;
                    wordBuf[wNum++] = atoi(lineBuf + i - wSize);
                    wSize = 0;
                }
                else
                    wSize++;
            }

            if(wSize > 0)
                wordBuf[wNum++] = atoi(lineBuf + i - wSize);

            wordBufCount = wNum;
            lineNum++;
        }
        else
            lineNum = 1;

        int i = -MAX_INT;

        /* create ngrams */
        for(i = MAX(pin, n - 1); i < wordBufCount - 1; i++)
        {
            memcpy(ngrams[num++].words, wordBuf + i - n + 1, sizeof(int) * n);
            if(num >= wordNum)
                break;
        }

        /* set a finished flag if we reach the end of the sentence */
        if(i >= wordBufCount - 1)
        {
            pin = 0;
            wordBufCount = 0;
        }
        /* record where to start next time if we break in the middle */
        else
        {
            pin = i + 1;
        }

        if((sentNum > 0 && lineNum >= sentNum) || num >= wordNum)
            break;
    }

    return num;
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum,
                         int * rows, int * cols, int itemNum, int devID, XMem * mem)
{
    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);

    tensor.SetZeroAll();

    /* set none-zero cells */
    for(int i = 0; i < itemNum; i++)
        tensor.Set2D(1.0F, rows[i], cols[i]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicate which word is encode for each ngram
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize,
                   int devID, XMem * mem)
{
    int * rows = new int[ngramNum];
    int * cols = new int[ngramNum];

    for(int i = 0; i < ngramNum; i++)
    {
        rows[i] = i;
        cols[i] = ngrams[i].words[n];
    }

    InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);

    delete[] rows;
    delete[] cols;
}
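A small worked example of the encoding (values chosen purely for illustration): with ngramNum = 3, vSize = 5 and word ids 2, 0, 4 at position n, the batch tensor is a 3 x 5 matrix with a single 1 per row,

    $\begin{pmatrix} 0 & 0 & 1 & 0 & 0 \\ 1 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 1 \end{pmatrix}$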
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{
    int batchSize = -1;
    int n = model.n;
    int depth = model.hDepth;
    TensorList eList(n - 1);

    /* previoius n - 1 words */
    for(int i = 0; i < n - 1; i++)
    {
        XTensor &input = inputs[i];
        XTensor &w = model.embeddingW;
        XTensor &embedding = net.embeddings[i];

        if(batchSize == -1)
            batchSize = input.dimSize[0];
        else
        {
            CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
        }

        /* embedding output tensor of position i */
        InitModelTensor2D(embedding, batchSize, model.eSize, model);

        /* generate word embedding of position i:
           embedding = input * w */
        _MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);

        eList.Add(&net.embeddings[i]);
    }

    /* concatenate word embeddings
       embeddingcat = cat(embedding_0...embedding_{n-1}) */
    InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
    _Concatenate(&eList, &net.embeddingCat, 1);

    /* go over each hidden layer */
    for(int i = 0; i < depth; i++)
    {
        XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &b = model.hiddenB[i];
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];

        InitModelTensor2D(h, batchSize, model.hSize, model);
        InitModelTensor2D(s, batchSize, model.hSize, model);

        /* generate hidden states of layer i:
           s = h_pre * w */
        _MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);

        /* make a 2d tensor for the bias term */
        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        /* introduce bias term:
           s = s + b
           NOTE: the trick here is to extend b to a 2d tensor
           to fit into the 2d representation in tensor summation */
        _Sum(&s, &b2D, &s);

        /* pass the state through the hard tanh function:
           h = tanh(s) */
        _HardTanH(&s, &h);
    }

    /* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
       y = softmax(h_last * w)
       Note that this is the implementation as that in Bengio et al.' paper.
       TODO: we add bias term here */
    {
        XTensor &h_last = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
        XTensor &w = model.outputW;
        XTensor &b = model.outputB;
        XTensor &s = net.stateLast;
        XTensor &y = output;

        InitModelTensor2D(s, batchSize, model.vSize, model);
        InitModelTensor2D(y, batchSize, model.vSize, model);

        /* s = h_last * w */
        _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);
        _Sum(&s, &b2D, &s);

        /* y = softmax(s) */
        _LogSoftmax(&s, &y, 1);
    }
}
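In the notation of the Bengio et al. (2003) model that this function follows, one forward pass over a batch row (with a single hidden layer, for brevity) is

    $e_i = x_i W^{emb}, \qquad h = \mathrm{hardtanh}\big([e_1; \dots; e_{n-1}]\, W^{hid} + b^{hid}\big), \qquad y = \log \mathrm{softmax}\big(h\, W^{out} + b^{out}\big)$

where x_i is the zero-one row for the i-th context word and [;] denotes concatenation along the feature dimension.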
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net)
{
    int batchSize = output.GetDim(0);
    int n = model.n;
    int depth = model.hDepth;

    /* back-propagation for the output layer */
    XTensor &y = output;
    XTensor &s = net.stateLast;
    XTensor &x = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
    XTensor &w = model.outputW;
    XTensor &dedw = grad.outputW;
    XTensor &dedb = grad.outputB;
    XTensor deds(&y);
    XTensor dedx(&x);

    /* for y = softmax(s), we get dE/ds
       where E is the error function (define by loss) */
    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

    /* for s = x * w, we get
       dE/w_{i,j} = dE/ds_j * ds/dw_{i,j}
                  = dE/ds_j * x_{i}
       (where i and j are the row and column indices, and
        x is the top most hidden layer)
       so we know
       dE/dw = x^T * dE/ds */
    _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

    /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
       specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
    _ReduceSum(&deds, &dedb, 0);

    /* then, we compute
       dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
                 = \sum_j' (dE/ds_{j'} * w_{j, j'})
       i.e.,
       dE/dx = dE/ds * w^T */
    _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

    XTensor &gradPassed = dedx;
    XTensor dedsHidden;
    XTensor dedxBottom;
    if(depth > 0)
        InitTensorV2(&dedsHidden, &dedx);
    InitTensorV2(&dedxBottom, &net.embeddingCat);

    /* back-propagation from top to bottom in the stack of hidden layers
       for each layer, h = f(s)
                       s = x * w + b */
    for(int i = depth - 1; i >= 0; i--)
    {
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];
        XTensor &x = i == 0 ? net.embeddingCat : net.hiddenStates[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &dedh = gradPassed;    // gradient passed though the previous layer
        XTensor &dedx = i == 0 ? dedxBottom : dedh;
        XTensor &deds = dedsHidden;
        XTensor &dedw = grad.hiddenW[i];
        XTensor &dedb = grad.hiddenB[i];

        /* backpropagation through the activation fucntion:
           dE/ds = dE/dh * dh/ds */
        _HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);

        /* gradient of the weight: dE/dw = x^T * dE/ds */
        _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

        /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
           specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
        _ReduceSum(&deds, &dedb, 0);

        /* gradient of the input: dE/dx = dE/ds * w^T */
        _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

        if(i > 0)
            _CopyValues(&dedx, &gradPassed);
    }

    TensorList eList(n - 1);

    /* back-propagation for the embedding layer */
    for(int i = 0; i < n - 1; i++)
    {
        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
        eList.Add(dedy);
    }

    /* gradient of the concatenation of the embedding layers */
    XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;

    /* split the concatenation of gradients of the embeddings */
    _Split(&dedyCat, &eList, 1, n - 1);

    /* go over for each word */
    for(int i = 0; i < n - 1; i++)
    {
        XTensor * dedy = (XTensor*)eList.GetItem(i);
        XTensor &x = inputs[i];
        XTensor &dedw = grad.embeddingW;

        /* gradient of the embedding weight: dE/dw += x^T * dE/dy
           NOTE that we accumulate dE/dw here because the matrix w
           is shared by several layers (or words) */
        _MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);

        delete dedy;
    }
}
/*
forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    int size = batch * (n - 1);
    int * index = new int[size];

    for(int i = 0; i < batch; i++)
    {
        for(int j = 0; j < n - 1; j++)
        {
            int a = i * (n - 1) + j;
            index[a] = ngrams[i].words[j];
        }
    }

    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);

    embeddingBig = Gather(model.embeddingW, words);

    delete[] index;

    int dimSize[2];
    dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
    dimSize[1] = embeddingBig.GetDim(1) * (n - 1);

    hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);

    /* output layer */
    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}

/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    TensorList inputList(n - 1);
    for(int i = 0; i < n - 1; i++)
        inputList.Add(inputs + i);

    /* represent n - 1 words in one tensor */
    words = Merge(inputList, 0);

    /* word embedding */
    embeddingBig = MMul(words, model.embeddingW);

    /* input of the first hidden layer */
    hidden = Split(embeddingBig, 0, n - 1);
    hidden = Merge(hidden, 2, 0);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];

    /* output layer */
    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
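The two ForwardAutoDiff variants build the same embedding lookup in different ways: the gather version indexes rows of embeddingW directly by word id, while the multiply version left-multiplies the zero-one batch matrix by embeddingW. For a one-hot row x whose 1 sits at position w, the two coincide:

    $x\, W^{emb} = W^{emb}_{w,:} = \mathrm{Gather}(W^{emb}, w)$

Note that, as written here, the gather path ends in Softmax while the multiply path ends in LogSoftmax.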
/*
dump the model to the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void Dump(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "wb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Dump(file, "embedding w:");
    for(int i = 0; i < model.hDepth; i++)
    {
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Dump(file, name);
        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Dump(file, name);
    }
    model.outputW.Dump(file, "output w:");
    model.outputB.Dump(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model saved\n");
}
/*
read the model from the disk space
>> fn - where to keep the model
>> model - the fnn model
*/
void Read(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "rb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Read(file, "embedding w:");
    for(int i = 0; i < model.hDepth; i++)
    {
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Read(file, name);
        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Read(file, name);
    }
    model.outputW.Read(file, "output w:");
    model.outputB.Read(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model loaded\n");
}
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void Test(const char * test, const char * result, FNNModel &model)
{
    int wordCount = 0;
    int sentCount = 0;
    float loss = 0;

    NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];

    double startT = GetClockSec();

    /* data files */
    FILE * file = fopen(test, "rb");
    CheckErrors(file, "Cannot read the test file");
    FILE * ofile = fopen(result, "wb");
    CheckErrors(ofile, "Cannot open the output file");

    int ngramNum = 1;
    while(ngramNum > 0)
    {
        /* load a minibatch of ngrams */
        ngramNum = LoadNGrams(file, model.n, ngrams, 1, MAX_INT);

        if(ngramNum <= 0)
            break;

        /* previous n - 1 words */
        XTensor inputs[MAX_N_GRAM];

        /* the predicted word */
        XTensor output;

        /* the gold standard */
        XTensor gold;

        /* make the input tensor for position i */
        for(int i = 0; i < model.n - 1; i++)
            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);

        /* make the gold tensor */
        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

        if(!autoDiff)
        {
            /* prepare an empty network for building the fnn */
            FNNNet net;

            /* forward computation */
            Forward(inputs, output, model, net);
        }
        else
        {
            /* this is implemented by gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);

            /* this is implemented by multiply function */
            //ForwardAutoDiff(inputs, output, model);
        }

        /* prediction probabilities */
        XTensor probs;
        InitTensor1DV2(&probs, ngramNum);

        /* get probabilities */
        float prob = GetProb(output, gold, &probs);

        /* dump the test result */
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "%d ", ngrams[0].words[i]);
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
        fprintf(ofile, "||| ");
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "<s> ");
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%f ", probs.Get1D(i));
        fprintf(ofile, "||| %f\n", prob);

        loss += -prob;
        wordCount += ngramNum;
        sentCount += 1;
    }

    fclose(file);

    double elapsed = GetClockSec() - startT;

    XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
            elapsed, sentCount, wordCount);

    delete[] ngrams;
}
/*
update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void
Update
(
FNNModel
&
model
,
FNNModel
&
grad
,
float
epsilon
,
bool
isNodeGrad
)
{
TensorList
paraList
(
10
);
TensorList
gradList
(
10
);
paraList
.
Add
(
&
model
.
outputW
);
paraList
.
Add
(
&
model
.
outputB
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
paraList
.
Add
(
&
model
.
hiddenW
[
i
]);
paraList
.
Add
(
&
model
.
hiddenB
[
i
]);
}
paraList
.
Add
(
&
model
.
embeddingW
);
if
(
!
isNodeGrad
)
{
gradList
.
Add
(
&
grad
.
outputW
);
gradList
.
Add
(
&
grad
.
outputB
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
gradList
.
Add
(
&
grad
.
hiddenW
[
i
]);
gradList
.
Add
(
&
grad
.
hiddenB
[
i
]);
}
;
gradList
.
Add
(
&
grad
.
embeddingW
);
}
else
{
gradList
.
Add
(
model
.
outputW
.
grad
);
gradList
.
Add
(
model
.
outputB
.
grad
);
for
(
int
i
=
0
;
i
<
model
.
hDepth
;
i
++
)
{
gradList
.
Add
(
model
.
hiddenW
[
i
].
grad
);
gradList
.
Add
(
model
.
hiddenB
[
i
].
grad
);
}
gradList
.
Add
(
model
.
embeddingW
.
grad
);
}
for
(
int
i
=
0
;
i
<
paraList
.
count
;
i
++
)
{
XTensor
*
para
=
(
XTensor
*
)
paraList
.
GetItem
(
i
);
XTensor
*
paraGrad
=
(
XTensor
*
)
gradList
.
GetItem
(
i
);
//fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
/* the delta rule */
_Sum
(
para
,
paraGrad
,
para
,
-
epsilon
);
}
}
/*
get prediction probabilites of the gold words
>> output - output probabilities
>> gold - gold standard
>> wordPobs - probability of each word
<< return - probability of the batch
*/
float
GetProb
(
XTensor
&
output
,
XTensor
&
gold
,
XTensor
*
wordProbs
)
{
XTensor
probs
;
InitTensorV2
(
&
probs
,
&
output
);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply
(
&
output
,
&
gold
,
&
probs
);
/* probability of each word */
XTensor
wprobs
;
InitTensor1DV2
(
&
wprobs
,
output
.
GetDim
(
0
),
output
.
dataType
,
output
.
devID
);
_ReduceSum
(
&
probs
,
&
wprobs
,
1
);
if
(
wordProbs
!=
NULL
)
_CopyValues
(
&
wprobs
,
wordProbs
);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
int
dims
[
2
];
dims
[
0
]
=
1
;
dims
[
1
]
=
probs
.
unitNum
;
probs
.
Reshape
(
2
,
dims
);
/* probability for the batch */
XTensor
result
;
InitTensor1DV2
(
&
result
,
1
,
X_FLOAT
,
output
.
devID
);
_ReduceSum
(
&
probs
,
&
result
,
1
);
return
result
.
Get1D
(
0
);
}
int
pin
=
0
;
int
wordBufCount
=
0
;
/*
load a minibatch of ngrams
>> file - data file
>> n - order of the language model
>> ngrams - the loaded ngrams
>> sentNum - maximum sentences kept in the minibatch
>> wordNum - maximum words kept in the minibatch
*/
int
LoadNGrams
(
FILE
*
file
,
int
n
,
NGram
*
ngrams
,
int
sentNum
,
int
wordNum
)
{
int
num
=
0
;
int
lineNum
=
0
;
while
(
pin
>
0
||
fgets
(
lineBuf
,
MAX_LINE_LENGTH_HERE
-
1
,
file
))
{
if
(
pin
<=
0
)
{
int
len
=
(
int
)
strlen
(
lineBuf
);
while
(
lineBuf
[
len
-
1
]
==
'\r'
||
lineBuf
[
len
-
1
]
==
'\n'
)
{
lineBuf
[
len
-
1
]
=
0
;
len
--
;
}
len
=
(
int
)
strlen
(
lineBuf
);
if
(
len
==
0
)
continue
;
/* how many characters are in a word */
int
wSize
=
0
;
/* how many words are in the sentence */
int
wNum
=
0
;
int
i
=
0
;
for
(
i
=
pin
;
i
<
len
;
i
++
)
{
/* load word (id) seperated by space or tab */
if
((
lineBuf
[
i
]
==
' '
||
lineBuf
[
i
]
==
'\t'
)
&&
wSize
>
0
)
{
lineBuf
[
i
]
=
0
;
wordBuf
[
wNum
++
]
=
atoi
(
lineBuf
+
i
-
wSize
);
wSize
=
0
;
}
else
wSize
++
;
}
if
(
wSize
>
0
)
wordBuf
[
wNum
++
]
=
atoi
(
lineBuf
+
i
-
wSize
);
wordBufCount
=
wNum
;
lineNum
++
;
}
else
lineNum
=
1
;
int
i
=
-
MAX_INT
;
/* create ngrams */
for
(
i
=
MAX
(
pin
,
n
-
1
);
i
<
wordBufCount
-
1
;
i
++
)
{
memcpy
(
ngrams
[
num
++
].
words
,
wordBuf
+
i
-
n
+
1
,
sizeof
(
int
)
*
n
);
if
(
num
>=
wordNum
)
break
;
}
/* set a finished flag if we reach the end of the sentence*/
if
(
i
>=
wordBufCount
-
1
)
{
pin
=
0
;
wordBufCount
=
0
;
}
/* record where to start next time if we break in the middle */
else
{
pin
=
i
+
1
;
}
if
((
sentNum
>
0
&&
lineNum
>=
sentNum
)
||
num
>=
wordNum
)
break
;
}
return
num
;
}
/*
make a 2d tensor in zero-one representation
The indexed cell is set to 1, and 0 otherwise.
>> tensor - the tensor to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> rows - row index
>> cols - column index
>> itemNum - number of non-zero items
>> devID - device id
>> mem - memory pool
*/
void
InitZeroOneTensor2D
(
XTensor
&
tensor
,
int
rowNum
,
int
colNum
,
int
*
rows
,
int
*
cols
,
int
itemNum
,
int
devID
,
XMem
*
mem
)
{
InitTensor2DV2
(
&
tensor
,
rowNum
,
colNum
,
X_FLOAT
,
devID
);
tensor
.
SetZeroAll
();
/* set none-zero cells */
for
(
int
i
=
0
;
i
<
itemNum
;
i
++
)
tensor
.
Set2D
(
1.0
F
,
rows
[
i
],
cols
[
i
]);
}
/*
make a tensor that encodes a batch of words
>> batch - the tensor encoding a batch of words
>> ngrams - the ngram batch
>> ngramNum - batch size
>> n - indicates which word of each ngram is encoded
>> vSize - vocabulary size
>> devID - device id
>> mem - memory pool
*/
void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
{
    int * rows = new int[ngramNum];
    int * cols = new int[ngramNum];

    for(int i = 0; i < ngramNum; i++){
        rows[i] = i;
        cols[i] = ngrams[i].words[n];
    }

    InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);

    delete[] rows;
    delete[] cols;
}
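/* A small worked example (illustrative numbers): with ngrams = {(3,7,2), (5,1,9)},
   ngramNum = 2, n = 0 and vSize = 10, MakeWordBatch produces a 2 x 10 zero-one matrix
   with batch[0][3] = 1 and batch[1][5] = 1, i.e. one one-hot row per ngram for the
   word at position n. */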
/*
forward procedure
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
>> net - the network that keeps the internal tensors generated in the process
*/
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
{
    int batchSize = -1;
    int n = model.n;
    int depth = model.hDepth;
    TensorList eList(n - 1);

    /* previous n - 1 words */
    for(int i = 0; i < n - 1; i++){
        XTensor &input = inputs[i];
        XTensor &w = model.embeddingW;
        XTensor &embedding = net.embeddings[i];

        if(batchSize == -1)
            batchSize = input.dimSize[0];
        else{
            CheckErrors(batchSize == input.dimSize[0], "Wrong input word representations!");
        }

        /* embedding output tensor of position i */
        InitModelTensor2D(embedding, batchSize, model.eSize, model);

        /* generate the word embedding of position i:
           embedding = input * w */
        _MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);

        eList.Add(&net.embeddings[i]);
    }

    /* concatenate the word embeddings
       embeddingcat = cat(embedding_0...embedding_{n-1}) */
    InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
    _Concatenate(&eList, &net.embeddingCat, 1);

    /* go over each hidden layer */
    for(int i = 0; i < depth; i++){
        XTensor &h_pre = i == 0 ? net.embeddingCat : net.hiddens[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &b = model.hiddenB[i];
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];

        InitModelTensor2D(h, batchSize, model.hSize, model);
        InitModelTensor2D(s, batchSize, model.hSize, model);

        /* generate the hidden states of layer i:
           s = h_pre * w */
        _MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);

        /* make a 2d tensor for the bias term */
        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        /* introduce the bias term:
           s = s + b
           NOTE: the trick here is to extend b to a 2d tensor
           to fit into the 2d representation in tensor summation */
        _Sum(&s, &b2D, &s);

        /* pass the state through the hard tanh function:
           h = tanh(s) */
        _HardTanH(&s, &h);
    }

    /* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
       y = softmax(h_last * w)
       Note that this is the same implementation as in Bengio et al.'s paper.
       TODO: we add a bias term here */
    {
        XTensor &h_last = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
        XTensor &w = model.outputW;
        XTensor &b = model.outputB;
        XTensor &s = net.stateLast;
        XTensor &y = output;

        InitModelTensor2D(s, batchSize, model.vSize, model);
        InitModelTensor2D(y, batchSize, model.vSize, model);

        /* s = h_last * w */
        _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

        XTensor b2D;
        InitTensorV2(&b2D, &s);
        _Unsqueeze(&b, &b2D, 0, batchSize);

        _Sum(&s, &b2D, &s);

        /* y = log softmax(s) */
        _LogSoftmax(&s, &y, 1);
    }
}
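/* A compact summary of the computation above (a reading aid; e(w) denotes the embedding
   row of word w):
       h_0 = [e(w_0); e(w_1); ...; e(w_{n-2})]              concatenated embeddings
       h_i = hardtanh(h_{i-1} * W_i + b_i), i = 1..depth    hidden layers
       y   = log softmax(h_last * W_out + b_out)            per-word log-probabilities */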
/*
backward procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
>> grad - the model that keeps the gradient information
>> net - the network that keeps the internal tensors generated in the process
*/
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
              FNNModel &model, FNNModel &grad, FNNNet &net)
{
    int batchSize = output.GetDim(0);
    int n = model.n;
    int depth = model.hDepth;

    /* back-propagation for the output layer */
    XTensor &y = output;
    XTensor &s = net.stateLast;
    XTensor &x = depth > 0 ? net.hiddens[depth - 1] : net.embeddingCat;
    XTensor &w = model.outputW;
    XTensor &dedw = grad.outputW;
    XTensor &dedb = grad.outputB;
    XTensor deds(&y);
    XTensor dedx(&x);

    /* for y = softmax(s), we get dE/ds
       where E is the error function (defined by loss) */
    _LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, NULL, 1, loss);

    /* for s = x * w, we get
       dE/dw_{i,j} = dE/ds_j * ds_j/dw_{i,j}
                   = dE/ds_j * x_{i}
       (where i and j are the row and column indices, and
        x is the top-most hidden layer)
       so we know
       dE/dw = x^T * dE/ds */
    _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

    /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
       specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
    _ReduceSum(&deds, &dedb, 0);

    /* then, we compute
       dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
                 = \sum_j' (dE/ds_{j'} * w_{j, j'})
       i.e.,
       dE/dx = dE/ds * w^T */
    _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

    XTensor &gradPassed = dedx;
    XTensor dedsHidden;
    XTensor dedxBottom;
    if(depth > 0)
        InitTensorV2(&dedsHidden, &dedx);
    InitTensorV2(&dedxBottom, &net.embeddingCat);

    /* back-propagation from top to bottom in the stack of hidden layers
       for each layer, h = f(s)
                       s = x * w + b */
    for(int i = depth - 1; i >= 0; i--){
        XTensor &h = net.hiddens[i];
        XTensor &s = net.hiddenStates[i];
        XTensor &x = i == 0 ? net.embeddingCat : net.hiddenStates[i - 1];
        XTensor &w = model.hiddenW[i];
        XTensor &dedh = gradPassed;  // gradient passed through the previous layer
        XTensor &dedx = i == 0 ? dedxBottom : dedh;
        XTensor &deds = dedsHidden;
        XTensor &dedw = grad.hiddenW[i];
        XTensor &dedb = grad.hiddenB[i];

        /* backpropagation through the activation function:
           dE/ds = dE/dh * dh/ds */
        _HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);

        /* gradient of the weight: dE/dw = x^T * dE/ds */
        _MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);

        /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
           specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
        _ReduceSum(&deds, &dedb, 0);

        /* gradient of the input: dE/dx = dE/ds * w^T */
        _MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);

        if(i > 0)
            _CopyValues(&dedx, &gradPassed);
    }

    TensorList eList(n - 1);

    /* back-propagation for the embedding layer */
    for(int i = 0; i < n - 1; i++){
        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
        eList.Add(dedy);
    }

    /* gradient of the concatenation of the embedding layers */
    XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;

    /* split the concatenation of gradients of the embeddings */
    _Split(&dedyCat, &eList, 1, n - 1);

    /* go over each word */
    for(int i = 0; i < n - 1; i++){
        XTensor * dedy = (XTensor*)eList.GetItem(i);
        XTensor &x = inputs[i];
        XTensor &dedw = grad.embeddingW;

        /* gradient of the embedding weight: dE/dw += x^T * dE/dy
           NOTE that we accumulate dE/dw here because the matrix w
           is shared by several layers (or words) */
        _MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);

        delete dedy;
    }
}
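/* A reading aid for the backward pass: for y = log softmax(s) with the cross-entropy
   loss E = -sum_j gold_j * y_j, the gradient returned by _LogSoftmaxBackward reduces to
   dE/ds = softmax(s) - gold, and the remaining steps are the plain chain rule,
       dE/dw = x^T * dE/ds,   dE/db = sum_i dE/ds_{i,:},   dE/dx = dE/ds * w^T,
   applied layer by layer from the output down to the shared embedding matrix. */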
/*
forward process (with tensor connections); this version is implemented with the gather function
>> ngrams - the loaded ngrams
>> batch - batch size (number of ngrams)
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    int size = batch * (n - 1);
    int * index = new int[size];

    for(int i = 0; i < batch; i++){
        for(int j = 0; j < n - 1; j++){
            int a = i * (n - 1) + j;
            index[a] = ngrams[i].words[j];
        }
    }

    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);
    words.Dump(stderr, "word:", 10);

    embeddingBig = Gather(model.embeddingW, words);

    delete[] index;

    int dimSize[2];
    dimSize[0] = embeddingBig.GetDim(0) / (n - 1);
    dimSize[1] = embeddingBig.GetDim(1) * (n - 1);
    embeddingBig.Dump(stderr, "embeddingBig:", 10);

    hidden = Reshape(embeddingBig, embeddingBig.order, dimSize);
    hidden.Dump(stderr, "hidden-0:", 10);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
    hidden.Dump(stderr, "hidden-1:", 10);

    /* output layer */
    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
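/* A small worked example of the index layout used above (illustrative numbers): for
   batch = 2 and n = 3, with context words (4, 9) and (7, 2), index = {4, 9, 7, 2}.
   Gather(model.embeddingW, words) then stacks the four embedding rows into a
   (batch * (n - 1)) x eSize tensor, and Reshape folds it into the
   batch x ((n - 1) * eSize) input of the first hidden layer. */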
/*
forward process (with tensor connections); this version is implemented with the multiply function
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
{
    int n = model.n;
    int depth = model.hDepth;

    XTensor words;
    XTensor embeddingBig;
    XTensor hidden;
    XTensor b;

    TensorList inputList(n - 1);
    for(int i = 0; i < n - 1; i++)
        inputList.Add(inputs + i);

    /* represent the n - 1 words in one tensor */
    words = Merge(inputList, 0);

    /* word embedding */
    embeddingBig = MMul(words, model.embeddingW);

    /* input of the first hidden layer */
    hidden = Split(embeddingBig, 0, n - 1);
    hidden = Merge(hidden, 2, 0);

    /* hidden layers */
    for(int i = 0; i < depth; i++)
        hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];

    /* output layer */
    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
/*
dump the model to disk
>> fn - the file to keep the model in
>> model - the fnn model
*/
void Dump(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "wb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Dump(file, "embedding w:");

    for(int i = 0; i < model.hDepth; i++){
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Dump(file, name);

        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Dump(file, name);
    }

    model.outputW.Dump(file, "output w:");
    model.outputB.Dump(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model saved\n");
}
/*
read the model from disk
>> fn - the model file to read from
>> model - the fnn model
*/
void Read(const char * fn, FNNModel &model)
{
    FILE * file = fopen(fn, "rb");
    CheckErrors(file, "Cannot open the model file");

    model.embeddingW.Read(file, "embedding w:");

    for(int i = 0; i < model.hDepth; i++){
        char name[MAX_NAME_LENGTH];
        sprintf(name, "hidden %d w:", i);
        model.hiddenW[i].Read(file, name);

        sprintf(name, "hidden %d b:", i);
        model.hiddenB[i].Read(file, name);
    }

    model.outputW.Read(file, "output w:");
    model.outputB.Read(file, "output b:");

    fclose(file);

    XPRINT(0, stderr, "[INFO] model loaded\n");
}
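/* A minimal save/load sketch ("fnn.model" is a hypothetical file name): the model should
   be configured and initialized with the same settings on both sides, since Read fills
   the existing model tensors:

       Dump("fnn.model", model);     // write the trained parameters
       ...
       Init(model);                  // same -n/-esize/-hsize/-vsize configuration
       Read("fnn.model", model);     // load the parameters back
*/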
/*
test the model
>> test - test data file
>> result - where to keep the result
>> model - the fnn model
*/
void Test(const char * test, const char * result, FNNModel &model)
{
    int wordCount = 0;
    int sentCount = 0;
    float loss = 0;

    NGram * ngrams = new NGram[MAX_LINE_LENGTH_HERE];

    double startT = GetClockSec();

    /* data files */
    FILE * file = fopen(test, "rb");
    CheckErrors(file, "Cannot read the test file");
    FILE * ofile = fopen(result, "wb");
    CheckErrors(ofile, "Cannot open the output file");

    int ngramNum = 1;
    while(ngramNum > 0){

        /* load a minibatch of ngrams */
        ngramNum = LoadNGrams(file, model.n, ngrams, 1, MAX_INT);

        if(ngramNum <= 0)
            break;

        /* previous n - 1 words */
        XTensor inputs[MAX_N_GRAM];

        /* the predicted word */
        XTensor output;

        /* the gold standard */
        XTensor gold;

        /* make the input tensor for position i */
        for(int i = 0; i < model.n - 1; i++)
            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);

        /* make the gold tensor */
        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

        if(!autoDiff){
            /* prepare an empty network for building the fnn */
            FNNNet net;

            /* forward computation */
            Forward(inputs, output, model, net);
        }
        else{
            /* this is implemented by the gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);
            output = Log(output);

            /* this is implemented by the multiply function */
            //ForwardAutoDiff(inputs, output, model);
        }

        /* prediction probabilities */
        XTensor probs;
        InitTensor1DV2(&probs, ngramNum);

        /* get probabilities */
        float prob = GetProb(output, gold, &probs);

        /* dump the test result */
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "%d ", ngrams[0].words[i]);
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
        fprintf(ofile, "||| ");
        for(int i = 0; i < model.n - 1; i++)
            fprintf(ofile, "<s> ");
        for(int i = 0; i < ngramNum; i++)
            fprintf(ofile, "%f ", probs.Get1D(i));
        fprintf(ofile, "||| %f\n", prob);

        loss += -prob;
        wordCount += ngramNum;
        sentCount += 1;
    }

    fclose(file);

    double elapsed = GetClockSec() - startT;

    XPRINT1(0, stderr, "[INFO] ppl=%.2f\n", exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d and ngram=%d)\n",
            elapsed, sentCount, wordCount);

    delete[] ngrams;
}
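/* A reading aid for the statistics printed above: "prob" is the summed log-probability
   of one sentence, so "loss" accumulates the negative log-likelihood of the test data,
   and
       ppl = exp(loss / wordCount)
   is the standard per-word perplexity over the wordCount scored ngrams. */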
};
source/tensor/Main.cpp
View file @ 2c4061e9
...
...
@@ -28,7 +28,7 @@
#include <time.h>
#include "XTensor.h"
#include "XDevice.h"
#include "./test/Test.h"
//#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
...
...
@@ -44,7 +44,7 @@ void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest();
void Tests();

int main( int argc, const char ** argv )
{
    //PowerTest();
...
...
@@ -63,7 +63,7 @@ int main( int argc, const char ** argv )
//return 0;
    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
        Tests();
    else{
        fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
        fprintf(stderr, "use of tensors. All you need is to ...\n\n");
...
...
@@ -75,219 +75,223 @@ int main( int argc, const char ** argv )
    return 0;
}
void myRead(XTensor * tensor, const char * filename, const char * label)
{
    FILE * file = fopen(filename, "rb");
    if(file == NULL)
        printf("%s\n", filename);

    tensor->Read(file, label);
}

void myDump(XTensor * tensor, const char * filename, const char * label)
{
    FILE * file = fopen(filename, "wb");
    if(file == NULL)
        printf("%s\n", filename);

    tensor->Dump(file, label);
}
void PowerTest()
{
    XTensor input;
    XTensor output;

    InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
    InitTensor2D(&output, 256, 10000, X_FLOAT, 0);

    myRead(&input, "1.txt", "");

    _Power(&input, &output, 2);
    output.Dump(stderr, "", 200);
}
void SmallTest()
{
    XTensor a;
    XTensor b;
    XTensor c;
    XTensor d;

    InitTensor2D(&a, 2, 2);
    InitTensor2D(&b, 2, 2);
    a.SetZeroAll();
    b.SetZeroAll();
    a.Set2D(1.0F, 0, 0);
    a.Set2D(2.0F, 1, 1);

    b = Sum(a, Multiply(a, a));

    /* this is prohibited !!!!!!!!!!!!! */
    //XTensor c = a * b + a;
    //XTensor d = a + b + c.Lin(0.5F);

    c = a * b + a;
    d = a + b + c.Lin(0.5F);

    XLink::CheckNetwork(&d);
    //XLink::ShowNetwork(stderr, &d);

    a.Dump(stderr, "a:");
    b.Dump(stderr, "b:");
    c.Dump(stderr, "c:");
    d.Dump(stderr, "d:");
}
void TransposeTest()
{
    XTensor a;
    XTensor b;

    int I = 2;
    int J = 3;

    InitTensor4D(&a, 2, 3, 4, 5);

    int * dims = new int[a.order];
    memcpy(dims, a.dimSize, sizeof(int) * a.order);
    dims[I] = a.dimSize[J];
    dims[J] = a.dimSize[I];

    InitTensor(&b, 4, dims);

void Tests()
{
    a.SetZeroAll();
    b.SetZeroAll();

    float * data = new float[a.unitNum];
    for(int i = 0; i < a.unitNum; i++)
        data[i] = (float)i;

    a.SetData(data, a.unitNum, 0);

    _Transpose(&a, &b, I, J);
    b.Dump(stderr, "b:");

    delete[] data;
}
void LittleTest()
{
    int a = 5000;
    int b = 100000;
    int c = a*b;
    printf("%d\n", c);

    exit(1);
}
void T2TTest()
{
    XTensor * input;
    XTensor * weight;
    XTensor * output;
    XTensor * gold;
    XTensor * dedy;
    XTensor * dedx;
    XTensor * dedxTmp;
    XTensor * dedw;
    XTensor * padding;

    DTYPE loss;

    int * dimSize = new int[2];
    dimSize[0] = 256;
    dimSize[1] = 10001;

    int * dimSize2 = new int[3];
    dimSize2[0] = 2;
    dimSize2[1] = 31;
    dimSize2[2] = 256;

    int * dimSize3 = new int[3];
    dimSize3[0] = 2;
    dimSize3[1] = 31;
    dimSize3[2] = 10001;

    int * dimSize4 = new int[2];
    dimSize4[0] = 2;
    dimSize4[1] = 31;

    input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
    weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
    dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
    gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
    padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);

    //weight = NewTensor(2, dimSize);
    //dedw = NewTensor(2, dimSize);
    //input = NewTensor(3, dimSize2);
    //gold = NewTensor(3, dimSize3);
    //output = NewTensor(3, dimSize3);
    //dedy = NewTensor(3, dimSize3);
    //dedx = NewTensor(3, dimSize3);
    //dedxTmp = NewTensor(3, dimSize3);
    //padding = NewTensor(2, dimSize4);

    myRead(input, "x.txt", "x");
    myRead(weight, "w.txt", "w");
    myRead(gold, "gold.txt", "gold");
    myRead(padding, "padding.txt", "padding");

    XTensor inter;
    inter = MMul(*input, *weight);

    _Softmax(&inter, output, 2);

    //_LogMe(output);
    loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);

    printf("loss: %f\n", loss);

    _CrossEntropyBackward(dedy, output, gold, NULL);
    //_CrossEntropyBackward(dedy, output, gold, NULL, padding);

    myDump(dedy, "dedy.txt", "dedy");

    _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
    _Sub(output, gold, dedxTmp);

    myDump(dedx, "dedx.txt", "dedx");
    dedx->Dump(stderr, "dedx", 200);
    dedxTmp->Dump(stderr, "dedxTmp", 200);

    input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
    dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));

    _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);

    myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
    int dimSize[3];
    dimSize[0] = 161;
    dimSize[1] = 47;
    dimSize[2] = 10001;
    XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
    //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);

    //myRead(probs, "probs.txt", " ");
    _SetDataFixedFloat(probs, 1.0F);

    probs->Reshape(1, probs->unitNum);

    DTYPE sum = _ReduceSumAll(probs);
    printf("%e\n", sum);

    //XTensor tmp;
    //tmp = IsNonZero(*probs);
    //DTYPE nonZeroNum = ReduceSumAll(tmp);
    //printf("%f\n", nonZeroNum);
    //
    //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
    //printf("%e\n", gpu);
}
//void myRead(XTensor * tensor, const char * filename, const char * label)
//{
// FILE * file = fopen(filename, "rb");
// if(file == NULL)
// printf("%s\n", filename);
// tensor->Read(file, label);
//}
//
//void myDump(XTensor * tensor, const char * filename, const char * label)
//{
// FILE * file = fopen(filename, "wb");
// if(file == NULL)
// printf("%s\n", filename);
// tensor->Dump(file, label);
//}
//
//void PowerTest()
//{
// XTensor input;
// XTensor output;
// InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
// InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
// myRead(&input, "1.txt", "");
//
// _Power(&input, &output, 2);
// output.Dump(stderr, "", 200);
//}
//
//void SmallTest()
//{
// XTensor a;
// XTensor b;
// XTensor c;
// XTensor d;
//
// InitTensor2D(&a, 2, 2);
// InitTensor2D(&b, 2, 2);
// a.SetZeroAll();
// b.SetZeroAll();
// a.Set2D(1.0F, 0, 0);
// a.Set2D(2.0F, 1, 1);
//
// b = Sum(a, Multiply(a, a));
//
// /* this is prohibited !!!!!!!!!!!!! */
// //XTensor c = a * b + a;
// //XTensor d = a + b + c.Lin(0.5F);
//
// c = a * b + a;
// d = a + b + c.Lin(0.5F);
//
// XLink::CheckNetwork(&d);
// //XLink::ShowNetwork(stderr, &d);
//
// a.Dump(stderr, "a:");
// b.Dump(stderr, "b:");
// c.Dump(stderr, "c:");
// d.Dump(stderr, "d:");
//}
//
//void TransposeTest()
//{
// XTensor a;
// XTensor b;
//
// int I = 2;
// int J = 3;
//
// InitTensor4D(&a, 2, 3, 4, 5);
//
// int * dims = new int[a.order];
// memcpy(dims, a.dimSize, sizeof(int) * a.order);
// dims[I] = a.dimSize[J];
// dims[J] = a.dimSize[I];
//
// InitTensor(&b, 4, dims);
//
// a.SetZeroAll();
// b.SetZeroAll();
//
// float * data = new float[a.unitNum];
// for(int i = 0; i < a.unitNum; i++)
// data[i] = (float)i;
//
// a.SetData(data, a.unitNum, 0);
//
// _Transpose(&a, &b, I, J);
// b.Dump(stderr, "b:");
//
// delete[] data;
//}
//
//void LittleTest()
//{
// int a = 5000;
// int b = 100000;
// int c = a*b;
// printf("%d\n", c);
//
// exit(1);
//}
//
//void T2TTest()
//{
// XTensor * input;
// XTensor * weight;
// XTensor * output;
// XTensor * gold;
// XTensor * dedy;
// XTensor * dedx;
// XTensor * dedxTmp;
// XTensor * dedw;
// XTensor * padding;
//
// DTYPE loss;
//
// int * dimSize = new int[2];
// dimSize[0] = 256;
// dimSize[1] = 10001;
//
// int * dimSize2 = new int[3];
// dimSize2[0] = 2;
// dimSize2[1] = 31;
// dimSize2[2] = 256;
//
// int * dimSize3 = new int[3];
// dimSize3[0] = 2;
// dimSize3[1] = 31;
// dimSize3[2] = 10001;
//
// int * dimSize4 = new int[2];
// dimSize4[0] = 2;
// dimSize4[1] = 31;
//
// input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
// weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
// gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
// padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//
// //weight = NewTensor(2, dimSize);
// //dedw = NewTensor(2, dimSize);
// //input = NewTensor(3, dimSize2);
// //gold = NewTensor(3, dimSize3);
// //output = NewTensor(3, dimSize3);
// //dedy = NewTensor(3, dimSize3);
// //dedx = NewTensor(3, dimSize3);
// //dedxTmp = NewTensor(3, dimSize3);
// //padding = NewTensor(2, dimSize4);
//
// myRead(input, "x.txt", "x");
// myRead(weight, "w.txt", "w");
// myRead(gold, "gold.txt", "gold");
// myRead(padding, "padding.txt", "padding");
//
// XTensor inter;
// inter = MMul(*input, *weight);
//
// _Softmax(&inter, output, 2);
//
// //_LogMe(output);
// loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
//
// printf("loss: %f\n", loss);
//
// _CrossEntropyBackward(dedy, output, gold, NULL);
// //_CrossEntropyBackward(dedy, output, gold, NULL, padding);
//
// myDump(dedy, "dedy.txt", "dedy");
//
// _SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
// _Sub(output, gold, dedxTmp);
//
// myDump(dedx, "dedx.txt", "dedx");
// dedx->Dump(stderr, "dedx", 200);
// dedxTmp->Dump(stderr, "dedxTmp", 200);
//
// input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
// dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
//
// _MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
//
// myDump(dedw, "dedw.txt", "dedw");
//}
//
//void T2TTest2()
//{
// int dimSize[3];
// dimSize[0] = 161;
// dimSize[1] = 47;
// dimSize[2] = 10001;
// XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
// //XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//
// //myRead(probs, "probs.txt", " ");
// _SetDataFixedFloat(probs, 1.0F);
//
// probs->Reshape(1, probs->unitNum);
//
// DTYPE sum = _ReduceSumAll(probs);
// printf("%e\n", sum);
//
// //XTensor tmp;
// //tmp = IsNonZero(*probs);
// //DTYPE nonZeroNum = ReduceSumAll(tmp);
// //printf("%f\n", nonZeroNum);
// //
// //DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//
// //printf("%e\n", gpu);
//}
source/tensor/loss/CrossEntropy.cu
View file @ 2c4061e9
...
...
@@ -196,17 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims;
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
int num = dedy->unitNum / dedy->GetDim(n);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
}
    //if(padding != NULL) {
    //    XTensor * tmp = NewTensor(padding);
    //    _IsNonZero(padding, tmp);
    //    int nonZeroNum = (int)_ReduceSumAll(tmp);
    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
    //    delete tmp;
    //}
    //else {
    //    int num = dedy->unitNum / dedy->GetDim(n);
    //    _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
    //}
}
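// A reading aid for the scaling above: dividing dE/dy by the number of scored positions
// (the count of non-zero entries of "padding" when a padding tensor is given, otherwise
// dedy->unitNum / dedy->GetDim(n)) makes the gradient match a loss that is averaged over
// those positions, i.e. the REDUCE_MEAN convention used by _CrossEntropyFast.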
...
...