Commit bc49d32a by xuchen

Merge with the liyinqiao branch and add the max/min functions

parent cadda317
......@@ -71,6 +71,9 @@ void BackwardTest()
XTensor a;
XTensor b;
XTensor c;
a.enableGrad = true;
b.enableGrad = false;
c.enableGrad = false;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
......@@ -88,14 +91,15 @@ void BackwardTest()
b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);
c = DivDim(a, b, 0);
DivDim(a, b, c, 0);
c.Dump(stderr, "c:");
auto loss = CrossEntropy(c, a);
//XLink::ShowNetwork(stderr, &c);
net.Backward(c);
net.Backward(loss);
net.Dump(stderr);
a.grad->Dump(stderr);
}
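The flags above make only `a` part of the autograd graph; the guarded `XLink::MakeLink` calls later in this commit skip link creation whenever an input opts out. A minimal sketch of the resulting behavior, assuming the `Div` wrapper shown further down (a hypothetical snippet, not part of the test):

XTensor x;
XTensor y;
InitTensor2D(&x, 2, 2);
InitTensor2D(&y, 2, 2);
x.enableGrad = true;             /* x participates in gradient computation */
y.enableGrad = false;            /* y is treated as a constant */
XTensor z = Div(x, y, 0.0F, 0);  /* no link is built: both inputs must enable gradients */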
......
......@@ -26,183 +26,9 @@
*
*/
#ifdef WIN32
#include <wtypes.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include "XBLAS.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#ifdef WIN32
HINSTANCE hBLASDll;
#endif
/* single-precision floating matrix-matrix multiplication */
void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication */
void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
double *, OPENBLAS_CONST BLASINT);
/* single-precision floating vector-vector multiplication (rank-1) */
void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1) */
void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
/* set the number of threads */
void (*XBLAS_SET_THREAD_NUM)(int);
/* get the number of threads */
//int (*XBLAS_GET_THREAD_NUM)();
/* get the number of physical processors (cores).*/
int (*XBLAS_GET_CORE_NUM)();
/* get the CPU corename */
//char * (*XBLAS_GET_CORE_NAME)();
/* get the parallelization type used by OpenBLAS */
//int (*XBLAS_GET_PARALLEL_TYPE)(void);
#if defined(USE_BLAS)
/* load some stuff for BLAS */
void LoadBLAS(const char * dllFileName)
{
#ifndef CUDA_BLAS
#ifdef _WIN32
#if defined(OPENBLAS)
/* non-ascii characters are not supported yet */
wchar_t * fn = new wchar_t[strlen(dllFileName) + 1];
memset(fn, 0, sizeof(wchar_t) * (strlen(dllFileName) + 1));
for(int i = 0; i < strlen(dllFileName); i++)
fn[i] = dllFileName[i];
hBLASDll = LoadLibrary((LPCWSTR)fn);
if(!hBLASDll){
XPRINT1(0, stderr, "[LoadBLAS] Error! Cannot load dll %s!\n", dllFileName);
exit(1);
}
/* matrix-matrix multiplication */
(FARPROC&)XBLAS_SGEMM = GetProcAddress(hBLASDll, "cblas_sgemm");
(FARPROC&)XBLAS_DGEMM = GetProcAddress(hBLASDll, "cblas_dgemm");
/* vector-vector multiplication */
(FARPROC&)XBLAS_SGER = GetProcAddress(hBLASDll, "cblas_sger");
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_set_num_threads");
//(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "goto_set_num_threads");
//(FARPROC&)XBLAS_GET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_get_num_threads");
(FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "openblas_get_num_procs");
//(FARPROC&)XBLAS_GET_CORE_NAME = GetProcAddress(hBLASDll, "openblas_get_corename");
//(FARPROC&)XBLAS_GET_PARALLEL_TYPE = GetProcAddress(hBLASDll, "openblas_get_parallel");
delete[] fn;
#endif // defined(OPENBLAS)
#if defined(MKL)
/* non-ascii characters are not supported yet */
wchar_t * fn = new wchar_t[strlen(dllFileName) + 1];
memset(fn, 0, sizeof(wchar_t) * (strlen(dllFileName) + 1));
for(int i = 0; i < strlen(dllFileName); i++)
fn[i] = dllFileName[i];
hBLASDll = LoadLibrary((LPCWSTR)fn);
if(!hBLASDll){
XPRINT1(0, stderr, "[LoadBLAS] Error! Cannot load dll %s!\n", dllFileName);
exit(1);
}
/* matrix-matrix multiplication */
(FARPROC&)XBLAS_SGEMM = GetProcAddress(hBLASDll, "cblas_sgemm");
(FARPROC&)XBLAS_DGEMM = GetProcAddress(hBLASDll, "cblas_dgemm");
/* vector-vector multiplication */
(FARPROC&)XBLAS_SGER = GetProcAddress(hBLASDll, "cblas_sger");
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "MKL_Set_Num_Threads");
(FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "MKL_Get_Max_Threads");
#endif // defined(MKL)
#else // _WIN32
XBLAS_SGEMM = &cblas_sgemm;
XBLAS_DGEMM = &cblas_dgemm;
XBLAS_SGER = &cblas_sger;
XBLAS_DGER = &cblas_dger;
#if defined(OPENBLAS)
XBLAS_SET_THREAD_NUM = &openblas_set_num_threads;
XBLAS_GET_CORE_NUM = &openblas_get_num_procs;
#endif // defined(OPENBLAS)
#if defined(MKL)
XBLAS_SET_THREAD_NUM = &mkl_set_num_threads;
XBLAS_GET_CORE_NUM = &mkl_get_max_num_threads;
#endif // defined(MKL)
#endif // _WIN32
XBLAS_SET_THREAD_NUM(1);
#endif // ndef(CUDA_BLAS)
}
/* unload the libs */
void UnloadBLAS()
{
#ifdef _WIN32
if(!FreeLibrary(hBLASDll)){
XPRINT(0, stderr, "[UnloadBLAS] Error! Cannot free the BLAS dll!\n");
exit(1);
}
#else
#endif
}
#else // !defined(USE_BLAS)
void LoadBLAS(const char * dllFileName)
{
XPRINT(0, stderr, "[LoadBLAS] Error! No Blas lib is available. Please use OPENBLAS or MKL!\n");
exit(1);
}
void UnloadBLAS()
{
XPRINT(0, stderr, "[UnloadBLAS] Error! No Blas lib is available. Please use OPENBLAS or MKL!\n");
exit(1);
}
#endif // defined(USE_BLAS)
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
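For orientation, a minimal usage sketch of the loader above (the dll name is a placeholder; on non-Windows builds the function pointers are bound statically inside LoadBLAS, so only the thread setting differs):

LoadBLAS("openblas.dll");     /* Windows: resolve the cblas_* symbols from the dll */
XBLAS_SET_THREAD_NUM(4);      /* override the single-thread default set by LoadBLAS */
/* ... calls through XBLAS_SGEMM / XBLAS_SGER go here ... */
UnloadBLAS();                 /* Windows: release the dll handle */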
......@@ -34,7 +34,6 @@ namespace nts{
/* some of the code below is from OpenBLAS (https://github.com/xianyi/OpenBLAS) */
//#define OPENBLAS
#define OPENBLAS_CONST const
typedef int BLASINT;
......@@ -46,7 +45,26 @@ typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
#if defined(USE_BLAS)
#ifdef OPENBLAS
#define XBLAS_SGEMM cblas_sgemm
#define XBLAS_DGEMM cblas_dgemm
#define XBLAS_SGER cblas_sger
#define XBLAS_DGER cblas_dger
#define XBLAS_SAXPY cblas_saxpy
#define XBLAS_DAXPY cblas_daxpy
#define XBLAS_SET_THREAD_NUM openblas_set_num_threads
#define XBLAS_GET_CORE_NUM openblas_get_num_procs
#endif
#ifdef MKL
#define XBLAS_SGEMM cblas_sgemm
#define XBLAS_DGEMM cblas_dgemm
#define XBLAS_SGER cblas_sger
#define XBLAS_DGER cblas_dger
#define XBLAS_SAXPY cblas_saxpy
#define XBLAS_DAXPY cblas_daxpy
#define XBLAS_SET_THREAD_NUM MKL_Set_Num_Threads
#define XBLAS_GET_CORE_NUM MKL_Get_Max_Threads
#endif
/*
single/double-precision floating matrix-matrix multiplication (rank-3)
- SGEMM (ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
......@@ -62,14 +80,14 @@ where A, B and C are matrices,
LDB(=N) specifies the size of the first dimension of B as declared in the calling (sub) program,
and LDC(=N) specifies the size of the first dimension of C as declared in the calling (sub) program.
*/
extern "C" void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
extern "C" void XBLAS_SGEMM(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication */
extern "C" void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
extern "C" void XBLAS_DGEMM(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
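As a concrete instance of the SGEMM calling convention documented above (row-major storage, no transposition; the values are purely illustrative):

/* C(2x2) = 1.0 * A(2x3) * B(3x2) + 0.0 * C, i.e. M=2, N=2, K=3,
   so in row-major storage LDA=K=3, LDB=N=2 and LDC=N=2 */
float A[6] = { 1.0F, 2.0F, 3.0F,
               4.0F, 5.0F, 6.0F };
float B[6] = { 1.0F, 0.0F,
               0.0F, 1.0F,
               1.0F, 1.0F };
float C[4] = { 0.0F };
XBLAS_SGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
            2, 2, 3, 1.0F, A, 3, B, 2, 0.0F, C, 2);
/* C now holds { 4, 5, 10, 11 } */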
......@@ -88,24 +106,33 @@ where X and Y are vectors with m and n elements respectively,
E.g., if we are using CblasRowMajor, the leading dimension is the number of columns of A.
*/
extern "C" void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
extern "C" void XBLAS_SGER(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1) */
extern "C" void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
extern "C" void XBLAS_DGER(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
/*
single-precision floating scalar-vector multiply-add (AXPY): y = a * x + y
*/
extern "C" void XBLAS_SAXPY(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, float *y, OPENBLAS_CONST BLASINT incy);
/* double-precision floating scalar-vector multiply-add (AXPY): y = a * x + y */
extern "C" void XBLAS_DAXPY(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, double *y, OPENBLAS_CONST BLASINT incy);
/* set the number of threads */
extern "C" void (*XBLAS_SET_THREAD_NUM)(int);
extern "C" void XBLAS_SET_THREAD_NUM(int);
/* get the number of threads */
//extern "C" int (*XBLAS_GET_THREAD_NUM)();
/* get the number of physical processors (cores).*/
extern "C" int (*XBLAS_GET_CORE_NUM)();
extern "C" int XBLAS_GET_CORE_NUM();
/* get the CPU corename */
//extern "C" char * (*XBLAS_GET_CORE_NAME)();
......@@ -113,58 +140,6 @@ extern "C" int (*XBLAS_GET_CORE_NUM)();
/* get the parallelization type used by OpenBLAS */
//extern "C" int (*XBLAS_GET_PARALLEL_TYPE)(void);
/* linux systems */
#ifndef _WIN32
/* cblas functions that are imported from the lib. See cblas.h in OpenBlas for more information */
extern "C" void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST BLASINT K, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *A, OPENBLAS_CONST BLASINT lda,
OPENBLAS_CONST float *B, OPENBLAS_CONST BLASINT ldb,
OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST BLASINT ldc);
extern "C" void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST BLASINT K, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *A, OPENBLAS_CONST BLASINT lda,
OPENBLAS_CONST double *B, OPENBLAS_CONST BLASINT ldb,
OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST BLASINT ldc);
extern "C" void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *X, OPENBLAS_CONST BLASINT incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT incY,
float *A, OPENBLAS_CONST BLASINT lda);
extern "C" void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *X, OPENBLAS_CONST BLASINT incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT incY,
double *A, OPENBLAS_CONST BLASINT lda);
#if defined(OPENBLAS)
/* better control of multi-threading */
extern "C" void openblas_set_num_threads(int num_threads);
extern "C" void goto_set_num_threads(int num_threads);
//extern "C" int openblas_get_num_threads(void);
extern "C" int openblas_get_num_procs(void);
//extern "C" char* openblas_get_config(void);
//extern "C" char* openblas_get_corename(void);
//extern "C" int openblas_get_parallel(void);
#endif
#endif
#if defined(MKL)
/* better control of multi-threading */
//_Mkl_Api(void,MKL_Set_Num_Threads,(int nth))
//_Mkl_Api(int,MKL_Get_Max_Threads,(void))
extern "C" void MKL_Set_Num_Threads(int num_threads);
extern "C" int MKL_Get_Max_Threads();
#define mkl_set_num_threads MKL_Set_Num_Threads
#define mkl_get_max_num_threads MKL_Get_Max_Threads
//extern "C" void mkl_set_num_threads(int num_threads);
//extern "C" void omp_set_num_threads(int num_threads);
//extern "C" int mkl_get_max_num_threads();
#endif
#if defined(CUDA_BLAS)
......@@ -186,24 +161,8 @@ extern void BLASMatrixMULD(int deviceID, double * a, double * b, double * c, int
#endif
#endif
#ifdef _WIN32
#include "windows.h"
extern HINSTANCE hBLASDll;
#else
#endif
/* load some stuff for BLAS */
extern void LoadBLAS(const char * dllFileName);
/* unload the libs */
extern void UnloadBLAS();
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -160,8 +160,10 @@ extern bool useCUDA;
/* BLAS interfaces */
#ifdef DOUBELPRICSION
#define GEMM XBLAS_DGEMM
#define AXPY XBLAS_DAXPY
#else
#define GEMM XBLAS_SGEMM
#define AXPY XBLAS_SAXPY
#endif
extern void InitGlobalAll();
......
......@@ -300,6 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
if(h == NULL)
return;
if (!t1->enableGrad)
return;
TensorList list(2);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
......@@ -320,6 +323,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
if (h == NULL)
return;
if (!t1->enableGrad || !t2->enableGrad)
return;
TensorList list(3);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
......@@ -370,6 +376,9 @@ create a hyper edge with a input tensors and a list of output tensors
*/
void XLink::MakeLink(XTensor * t, TensorList * list, int id)
{
if (!t->enableGrad)
return;
/* forward */
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
......
......@@ -23,15 +23,11 @@
*
*/
#include "XList.h"
#include "time.h"
#include "XMem.h"
#include "XList.h"
#include "XGlobal.h"
#include <ctime>
#include <utility>
#include <algorithm>
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
......@@ -78,7 +74,8 @@ TensorListBase<T>::TensorListBase(int myMaxNum, XMem* myMem)
template <typename T>
TensorListBase<T>::~TensorListBase()
{
delete[] items;
if(items && mem)
delete[] items;
}
......@@ -90,7 +87,7 @@ template <typename T>
void TensorListBase<T>::Add(T&& item)
{
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
......@@ -101,7 +98,13 @@ void TensorListBase<T>::Add(T&& item)
maxNum = maxNum * 2 + 1;
}
items[count++] = item;
}
/* return number of elements */
template<typename T>
size_t TensorListBase<T>::Size()
{
return count;
}
/*
......@@ -111,18 +114,18 @@ add an item into the list
template <typename T>
void TensorListBase<T>::Add(const T& item)
{
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
items[count++] = item;
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
items[count++] = item;
}
/*
......@@ -131,7 +134,7 @@ add a number of items into the list
>> inputItemCount - number of input items
*/
template <typename T>
void TensorListBase<T>::Add(T* inputItems, int inputItemCount)
void TensorListBase<T>::Add(const T* inputItems, int inputItemCount)
{
if (count + inputItemCount >= maxNum) {
int newMaxNum = (count + inputItemCount) * 2 + 1;
......@@ -186,31 +189,31 @@ void TensorListBase<T>::Insert(int pos, const T& item)
template<typename T>
void TensorListBase<T>::Insert(int pos, T&& item)
{
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
for (int i = count - 1; i >= pos; i--)
items[i + 1] = items[i];
items[pos] = item;
count++;
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
for (int i = count - 1; i >= pos; i--)
items[i + 1] = items[i];
items[pos] = item;
count++;
}
/* get the item at position i */
template <typename T>
T& TensorListBase<T>::GetItem(int i) const
{
CheckNTErrors(i >= -1 && i < count, "Index of a list item is out of scope!");
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i == -1)
return items[count - 1];
if (i < 0)
return items[count + i];
else
return items[i];
}
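Since the bound check now accepts indices in [-count, count), negative values address the list from the back, e.g. (a hypothetical snippet):

TensorList list(4);
/* suppose three tensors t0, t1 and t2 have been added */
XTensor * last  = list.GetItem(-1);   /* same as list.GetItem(2) */
XTensor * first = list.GetItem(-3);   /* same as list.GetItem(0) */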
......@@ -226,8 +229,8 @@ inline void TensorListBase<T>::SetItem(int i, const T& item)
template<typename T>
inline void TensorListBase<T>::SetItem(int i, T&& item)
{
if (i >= 0 && i < count)
items[i] = std::move(item);
if (i >= 0 && i < count)
items[i] = item;
}
/*
......@@ -246,11 +249,31 @@ inline int TensorListBase<T>::FindFirst(const T& item)
return -1;
}
template <>
inline int TensorListBase<Example>::FindFirst(const Example& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
template <>
inline int TensorListBase<Result>::FindFirst(const Result& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
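Both specializations match on the id field alone and ignore the data payload. A usage sketch (assuming Example is default-constructible):

ExampleList buf(8);
/* ... buf is filled elsewhere ... */
Example probe;
probe.id = 42;                    /* only the id takes part in the comparison */
int pos = buf.FindFirst(probe);   /* -1 if no example with id 42 is stored */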
/* clear the data array */
template <typename T>
void TensorListBase<T>::Clear()
{
count = 0;
count = 0;
}
/*
......@@ -295,6 +318,17 @@ void TensorListBase<T>::Remove(int i)
count--;
}
template<typename T>
void TensorListBase<T>::Reserve(int n)
{
if (items) {
/* the data array has already been allocated, so the request is ignored */
return;
}
items = new T[n];
}
/*
copy the list
>> myMem - memory pool used for allocating the data in the new list
......@@ -349,6 +383,8 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<Result>;
template struct TensorListBase<Example>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -32,7 +32,7 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* the TensorListBase class */
template <typename T>
struct TensorListBase {
......@@ -66,68 +66,85 @@ public:
/* add an item into the list */
void Add(T&& item);
/* add an item into the list */
void Add(const T& item);
/* return number of elements */
size_t Size();
/* add a number of items into the list */
void Add(T* inputItems, int inputItemCount);
/* add an item into the list */
void Add(const T& item);
/* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount);
/* append a list to the current list */
/* append a list to the current list */
void AddList(TensorListBase* l);
/* insert an item to the given position of the list */
/* insert an item to the given position of the list */
void Insert(int pos, const T& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* get the item at position i */
/* get the item at position i */
T& GetItem(int i) const;
/* set the item at position i */
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* find the position of the first matched item */
/* find the position of the first matched item */
int FindFirst(const T& item);
/* clear the data array */
/* clear the data array */
void Clear();
/* sort the list */
/* sort the list */
void Sort(int itemSize);
/* reverse the list */
/* reverse the list */
void Reverse();
/* remove the item at position i */
/* remove the item at position i */
void Remove(int i);
/* copy the list */
/* reserve space for data entry */
void Reserve(int n);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
T& operator[] (int i) {
return GetItem(i);
};
T& operator[] (int i) { return GetItem(i); };
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
void Set(int i, T item) { SetItem(i, item); };
};
struct XTensor;
typedef TensorListBase<void*> XList;
typedef TensorListBase<int> IntList;
typedef TensorListBase<char> CharList;
typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<void*> XList;
struct Example {
int id;
IntList data;
};
struct Result {
int id;
IntList data;
};
typedef TensorListBase<Result> ResultList;
typedef TensorListBase<Example> ExampleList;
typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -51,7 +51,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MAX MATH_MATRIXMULBATCHED + 1
#define MATH_MIN MATH_MAX + 1
#define MATH_MULTIPLY MATH_MIN + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
......
......@@ -280,7 +280,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
enableGrad = false;
enableGrad = true;
visitMark = 0;
grad = NULL;
}
......@@ -397,7 +397,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
/* create tensor links for the new tensor */
XLink::Replace(&tensor, this);
XLink::Copy(&tensor, this);
}
return *this;
......@@ -445,7 +445,7 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
This is VERY tricky and there might be better solutions :) */
*tensor.dataP = NULL;
XLink::Replace(&tensor, this);
XLink::Copy(&tensor, this);
return *this;
}
......@@ -720,7 +720,7 @@ int XTensor::GetSize() const
}
/* get the size of the memory space used */
int XTensor::GetDataSizeInChar()
int XTensor::GetDataSizeInChar() const
{
if(isSparse){
int num = int(unitNum * denseRatio + 1);
......@@ -738,7 +738,7 @@ get unit size in terms of "dataType"
>> myDataType - type of unit
<< return - unit size
*/
int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType)
int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType) const
{
if(myDataType == X_INT)
return sizeof(int);
......@@ -758,7 +758,7 @@ get offset (2D)
>> row - index of dimension 0
>> col - index of dimension 1
*/
MTYPE XTensor::GetOffset2D(int row, int col)
MTYPE XTensor::GetOffset2D(int row, int col) const
{
CheckNTErrors(order == 2, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(row >= 0 && row < dimSize[0], "dimension 0 is out of range!");
......@@ -773,7 +773,7 @@ get offset (3D)
>> d1 - index of dimension 1
>> d2 - index of dimension 2
*/
MTYPE XTensor::GetOffset3D(int d0, int d1, int d2)
MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
{
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
......@@ -856,6 +856,16 @@ void XTensor::Rand(int rNum, int cNum)
_SetDataRand(this, rNum, cNum);
}
/* generate data items in a range specified by a lower bound, an upper bound and a step
>> lower - the lower bound of the range (included)
>> upper - the upper bound of the range (excluded)
>> step - the distance between two adjacent items
*/
void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
{
_SetDataRange(this, lower, upper, step);
}
/*
set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
......@@ -929,9 +939,11 @@ set the tensor items by a normal distribution
void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
{
// TODO: cuda code!!!!!!!
if (data == NULL)
return;
// srand((unsigned)time(0));
void * d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
......@@ -986,7 +998,7 @@ void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
>> num - number of data items
>> beg - where we start this in the data array of the tensor
*/
bool XTensor::CheckData(const void * d, int num, int beg)
bool XTensor::CheckData(const void * d, int num, int beg) const
{
if (data == NULL || d == NULL)
return false;
......@@ -1030,7 +1042,7 @@ bool IsFloatEqual(DTYPE a, DTYPE b, float absError, float relError)
}
/* check whether the data array is the same as the "answer" */
bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
bool XTensor::CheckData(const void * d, int num, float tolerance, int beg) const
{
if (data == NULL || d == NULL)
return false;
......@@ -1106,7 +1118,7 @@ get the value of a cell with the index
>> size - size of the index
<< return - cell value
*/
DTYPE XTensor::Get(int index[], int size)
DTYPE XTensor::Get(int index[], int size) const
{
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in the default type.");
......@@ -1118,7 +1130,7 @@ get the value of a cell with its offset
>> offset - offset in the array
<< return - cell value
*/
DTYPE XTensor::Get(int offset)
DTYPE XTensor::Get(int offset) const
{
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in the default type.");
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
......@@ -1170,7 +1182,7 @@ get the value of a cell in a 1d tensor in default type
>> i - index
<< return - value of cell(i) in float
*/
DTYPE XTensor::Get1D(int i)
DTYPE XTensor::Get1D(int i) const
{
CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
......@@ -1207,7 +1219,7 @@ get the value of a cell in a 3d tensor
>> d1 - index of dimension 1
>> d2 - index of dimension 2
*/
DTYPE XTensor::Get3D(int d0, int d1, int d2)
DTYPE XTensor::Get3D(int d0, int d1, int d2) const
{
CheckNTErrors((order == 3), "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
......@@ -1225,7 +1237,7 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2)
get the int value of a cell by its offset
>> offset - offset of the item
*/
int XTensor::GetInt(int offset)
int XTensor::GetInt(int offset) const
{
CheckNTErrors(dataType == X_INT, "The tensor is not in the integer type.");
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
......@@ -1242,7 +1254,7 @@ get the value of a cell in a 1d tensor in int type
>> i - index
<< return - value of cell(i) in int
*/
int XTensor::Get1DInt(int i)
int XTensor::Get1DInt(int i) const
{
CheckNTErrors(order == 1, "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
......@@ -1260,7 +1272,7 @@ get the value of a cell in a 2d tensor in int type
>> mi - column index
<< return - value of cell(ni, mi) in int
*/
int XTensor::Get2DInt(int ni, int mi)
int XTensor::Get2DInt(int ni, int mi) const
{
CheckNTErrors(order == 2, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(ni >= 0 && ni < dimSize[0], "dimension 0 is out of range!");
......@@ -1280,7 +1292,7 @@ get the value of a cell in a 3d tensor in int type
>> d2 - index of dimension 2
<< return - value of cell(d0, d1, d2) in int
*/
int XTensor::Get3DInt(int d0, int d1, int d2)
int XTensor::Get3DInt(int d0, int d1, int d2) const
{
CheckNTErrors(order == 3, "Cannot get a 3d cell for a tensor whose order is not 3!");
CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
......@@ -1299,7 +1311,7 @@ get the value of a cell in the sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
<< return - value of the tuple
*/
DTYPE XTensor::GetInSparse(int i)
DTYPE XTensor::GetInSparse(int i) const
{
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
......@@ -1315,7 +1327,7 @@ get the key value of a tuple in a sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
<< return - key of the tuple
*/
int XTensor::GetKeyInSparse(int i)
int XTensor::GetKeyInSparse(int i) const
{
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
......@@ -1528,7 +1540,7 @@ increase the value of a cell in a 2d tensor
}
/* get the number of non-zero elements (in a sparse tensor) */
int XTensor::GetNonzeroSize()
int XTensor::GetNonzeroSize() const
{
if(!isSparse){
XPRINT(1, stderr, "WARNING! Counting non-zero elements in a dense tensor might be slow!\n");
......@@ -1893,148 +1905,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
void * RecursionData(XTensor * s, int dim, int * index, void * d, FILE * file)
{
if (dim == s->order - 2) {
/* print index */
printf("Index: ");
for (int i = 0; i < s->order-2; i++)
printf("[%d]", index[i]);
int dimSize1 = s->dimSize[dim];
int dimSize2 = s->dimSize[dim+1];
printf(" %d * %d\n", dimSize1, dimSize2);
/* print 2D data */
if (s->dataType == X_FLOAT) {
float * data = (float*)d;
for (int i = 0; i < dimSize1; i++) {
printf("\t");
for (int j = 0; j < dimSize2; j++)
fprintf(file, "%e ", *data++);
fprintf(file, "\n");
}
d = (float*)d + dimSize1 *dimSize2;
}
else if (s->dataType == X_INT) {
int * data = (int*)d;
for (int i = 0; i < dimSize1; i++) {
printf("\t");
for (int j = 0; j < dimSize2; j++)
fprintf(file, "%d ", *data++);
fprintf(file, "\n");
}
d = (int*)d + dimSize1 *dimSize2;
}
else
ShowNTErrors("TODO!");
return d;
}
/* recursion for deeper dimensions */
int levelSize = s->dimSize[dim];
for (int k = 0; k < levelSize; k++) {
index[dim] = k;
d = RecursionData(s, dim+1, index, d, file);
}
return d;
}
/*
dump data to a file
>> file - where to dump the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::DumpFormat(FILE * file, const char * label, const int n, const int beg, const int verbose)
{
if (verbose > verboseLevel)
return;
void * d = data;
bool isNewData = false;
#ifdef USE_CUDA
if (devID >= 0) {
CudaGPUToCPUFlush(this);
d = dataHost;
isNewData = true;
}
#endif
if (d == NULL) {
if (isSparse) {
int num = 0;
for (int i = 0; i < order; i++)
num *= dimSizeRDI[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
d = new char[size];
memset(d, 0, size);
}
else {
d = new char[unitNum * unitSize];
memset(d, 0, unitNum * unitSize);
}
isNewData = true;
}
if (label != NULL)
fprintf(file, "%s ", label);
if(isInit){
fprintf(file, "id=%d ", id);
fprintf(file, "order=%d dimsize=", order);
for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]);
if (i < order - 1)
fprintf(file, ",");
}
}
else{
fprintf(file, "order=-1 dimsize=-1");
}
fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
if(!isInit){
fprintf(file, "NULL");
}
if (order == 1) {
for (int i = 0; i < unitNum; i++) {
if (dataType == X_FLOAT)
fprintf(file, "%e ", ((float*)d)[i]);
else if (dataType == X_INT)
fprintf(file, "%d ", ((int*)d)[i]);
else
ShowNTErrors("TODO!");
}
printf("\n");
}
/* print multi-dimensional tensor */
else {
int * index = new int[order];
RecursionData(this, 0, index, d, file);
delete[] index;
}
fprintf(file, "\n");
if (isNewData) {
delete[](char*)d;
#ifdef USE_CUDA
if (devID >= 0)
dataHost = NULL;
#endif
}
}
/*
dump data to a file
>> tensor - the tensor for dumping
......@@ -2052,6 +1922,26 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
}
/*
dump data to a binary file
>> file - where to dump the data
*/
void XTensor::BinaryDump(FILE* file)
{
XTensor tmp;
InitTensorOnCPU(&tmp, this);
_CopyValues(this, &tmp);
switch (dataType) {
case X_INT: {
fwrite(tmp.data, sizeof(int), unitNum, file);
break;   /* without the break, execution falls through and writes the data twice */
}
default: {
fwrite(tmp.data, sizeof(float), unitNum, file);
}
}
}
/*
read data from a file
>> file - where to load the data
>> label - label of the tensor
......@@ -2163,6 +2053,30 @@ void XTensor::Read(FILE * file, const char * label)
delete[](char*)dataBuf;
}
/*
read data from a binary file
>> file - the file stream pointer
>> offset - the distance from the start of the file to this tensor's data
*/
void XTensor::BinaryRead(FILE* file, size_t offset)
{
fseek(file, offset, SEEK_SET);
switch (dataType) {
case X_INT: {
int * d = new int[unitNum];
fread(d, sizeof(int), unitNum, file);
SetData(d, unitNum);
delete[] d;
break;   /* without the break, the default branch would read the data again as floats */
}
default: {
float * d = new float[unitNum];
fread(d, sizeof(float), unitNum, file);
SetData(d, unitNum);
delete[] d;
}
}
}
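A round-trip usage sketch for the two binary routines (the file name is a placeholder; neither routine stores the shape, so the reader must initialize the tensor to the expected shape first):

XTensor t;
InitTensor2D(&t, 2, 3);
t.SetDataRand(0.0F, 1.0F);

FILE * fout = fopen("tensor.bin", "wb");
t.BinaryDump(fout);               /* raw cells only, no header */
fclose(fout);

XTensor u;
InitTensor2D(&u, 2, 3);
FILE * fin = fopen("tensor.bin", "rb");
u.BinaryRead(fin, 0);             /* offset 0: the data starts at the file head */
fclose(fin);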
/*
flush the data to the target device
>> targetMem - memory pool on the target device
......@@ -2327,7 +2241,7 @@ initialize a dense tensor V2
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const int myDevID)
const int myDevID, const bool isEnableGrad)
{
if (tensor->mem == NULL) {
XMem * myMem = GMems.GetMem(myDevID);
......@@ -2359,6 +2273,7 @@ void InitTensorV2(XTensor * tensor,
if(allocated)
XTensor::AllocateData(tensor);
}
tensor->enableGrad = isEnableGrad;
}
/*
......@@ -2392,12 +2307,12 @@ initialize a dense tensor V2
*/
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[1];
dims[0] = num;
InitTensorV2(tensor, 1, dims, myDataType, myDevID);
InitTensorV2(tensor, 1, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2434,13 +2349,13 @@ initialize a dense matrix V2
*/
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[2];
dims[0] = rowNum;
dims[1] = colNum;
InitTensorV2(tensor, 2, dims, myDataType, myDevID);
InitTensorV2(tensor, 2, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2480,14 +2395,14 @@ initialize a dense 3d tensor V2
*/
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[3];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
InitTensorV2(tensor, 3, dims, myDataType, myDevID);
InitTensorV2(tensor, 3, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2530,7 +2445,7 @@ initialize a dense 4d tensor V2
*/
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[4];
dims[0] = d0;
......@@ -2538,7 +2453,7 @@ void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2,
dims[2] = d2;
dims[3] = d3;
InitTensorV2(tensor, 4, dims, myDataType, myDevID);
InitTensorV2(tensor, 4, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2584,7 +2499,7 @@ initialize a dense 5d tensor V2
*/
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[5];
dims[0] = d0;
......@@ -2593,7 +2508,7 @@ void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2,
dims[3] = d3;
dims[4] = d4;
InitTensorV2(tensor, 5, dims, myDataType, myDevID);
InitTensorV2(tensor, 5, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2685,10 +2600,12 @@ generate a dense XTensor V2
*/
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const int myDevID)
const int myDevID, const bool isEnableGrad)
{
XMem * myMem = GMems.GetMem(myDevID);
return new XTensor(myOrder, myDimSize, myDataType, 1.0F, myDevID, myMem);
XTensor * tensor = new XTensor(myOrder, myDimSize, myDataType, 1.0F, myDevID, myMem);
tensor->enableGrad = isEnableGrad;
return tensor;
}
/*
......@@ -2714,6 +2631,9 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
if(myMem != NULL)
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
else
......@@ -2732,14 +2652,14 @@ generate a dense XTensor which allocates data on the buffer V2
*/
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const int devID)
const TENSOR_DATA_TYPE myDataType, const int devID, const bool isEnableGrad)
{
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, myDimSize, sizeof(int) * myOrder);
dims[0] = -abs(dims[0]);
XTensor * tensor = NewTensorV2(myOrder, dims, myDataType, devID);
XTensor * tensor = NewTensorV2(myOrder, dims, myDataType, devID, isEnableGrad);
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
......@@ -2771,10 +2691,10 @@ generate a XTensor which allocates data on the buffer V2
>> reference - reference tensor
>> devID - device id
*/
XTensor * NewTensorBufV2(const XTensor * reference, int devID)
XTensor * NewTensorBufV2(const XTensor * reference, int devID, const bool isEnableGrad)
{
return NewTensorBufV2(reference->order, reference->dimSize,
reference->dataType, devID);
reference->dataType, devID, isEnableGrad);
}
/*
......@@ -2806,12 +2726,12 @@ generate a dense vector V2
*/
XTensor * NewTensor1DV2(const int num,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[1];
dims[0] = num;
return NewTensorV2(1, dims, myDataType, myDevID);
return NewTensorV2(1, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2846,13 +2766,13 @@ generate a dense matrix V2
*/
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[2];
dims[0] = rowNum;
dims[1] = colNum;
return NewTensorV2(2, dims, myDataType, myDevID);
return NewTensorV2(2, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2890,14 +2810,14 @@ generate a dense 3d tensor V2
*/
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[3];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
return NewTensorV2(3, dims, myDataType, myDevID);
return NewTensorV2(3, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2938,7 +2858,7 @@ generate a dense 4d tensor V2
*/
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[4];
dims[0] = d0;
......@@ -2946,7 +2866,7 @@ XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
dims[2] = d2;
dims[3] = d3;
return NewTensorV2(4, dims, myDataType, myDevID);
return NewTensorV2(4, dims, myDataType, myDevID, isEnableGrad);
}
/*
......@@ -2990,7 +2910,7 @@ generate a dense 5d tensor V2
*/
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int dims[5];
dims[0] = d0;
......@@ -2999,7 +2919,17 @@ XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3,
dims[3] = d3;
dims[4] = d4;
return NewTensorV2(5, dims, myDataType, myDevID);
return NewTensorV2(5, dims, myDataType, myDevID, isEnableGrad);
}
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType, const int myDevID, const bool isEnableGrad)
{
int size = abs(upper - lower);
int unitNum = ceil(1.0 * size / abs(step));
XTensor * tensor = NewTensor1DV2(unitNum, myDataType, myDevID, isEnableGrad);
tensor->Range(lower, upper, step);
return tensor;
}
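A quick usage sketch for the new range constructor (the data type defaults to X_INT; the interval follows the [lower, upper) convention documented for Range):

/* an int vector with ceil((10 - 0) / 3) = 4 items: 0, 3, 6, 9 */
XTensor * idx = NewTensorRange(0, 10, 3);
idx->Dump(stderr, "idx:");
delete idx;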
/*
......
......@@ -290,16 +290,16 @@ public:
int GetSize() const;
/* get size of the memory used */
int GetDataSizeInChar();
int GetDataSizeInChar() const;
/* get unit size in terms of "dataType" */
int GetUnitSize(TENSOR_DATA_TYPE myDataType);
int GetUnitSize(TENSOR_DATA_TYPE myDataType) const;
/* get offset (2D) */
MTYPE GetOffset2D(int row, int col);
MTYPE GetOffset2D(int row, int col) const;
/* get offset (3D) */
MTYPE GetOffset3D(int d0, int d1, int d2);
MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
......@@ -310,6 +310,9 @@ public:
/* generate data items with a uniform distribution in [0, 1] */
void Rand(int rNum, int cNum);
/* generate data items with a range by start, end and the step */
void Range(DTYPE lower, DTYPE upper, DTYPE step);
/* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
......@@ -323,10 +326,10 @@ public:
void SetDataBatchedWithValues(MTYPE * offsets, void * values, int num);
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);
bool CheckData(const void * answer, int num, int beg = 0) const;
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
bool CheckData(const void * answer, int num, float tolerance, int beg = 0) const;
/* set the pointer to "data" */
void SetDataPointer();
......@@ -335,40 +338,40 @@ public:
void SetAscendingOrder(int dim);
/* get the value of a cell with the index */
DTYPE Get(int index[], int size = -1);
DTYPE Get(int index[], int size = -1) const;
/* get the value of a cell with the offset */
DTYPE Get(int offset);
DTYPE Get(int offset) const;
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i);
DTYPE Get1D(int i) const;
/* get the default type value of a cell in a 2d tensor */
DTYPE Get2D(int ni, int mi) const;
/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);
DTYPE Get3D(int d0, int d1, int d2) const;
/* get the int value of a cell by its offset */
int GetInt(int offset);
int GetInt(int offset) const;
/* get the int value of a cell in a 1d tensor */
int Get1DInt(int i);
int Get1DInt(int i) const;
/* get the int value of a cell in a 2d tensor */
int Get2DInt(int ni, int mi);
int Get2DInt(int ni, int mi) const;
/* get the int value of a cell in a 3d tensor */
int Get3DInt(int d0, int d1, int d2);
int Get3DInt(int d0, int d1, int d2) const;
/* get the value of a cell in a sparse tensor */
DTYPE GetInSparse(int i);
DTYPE GetInSparse(int i) const;
/* get the key value of a tuple in a sparse tensor */
int GetKeyInSparse(int i);
int GetKeyInSparse(int i) const;
/* set the value of a cell */
bool Set(DTYPE value, int index[], int size = -1);
......@@ -404,7 +407,7 @@ public:
bool Add2D(DTYPE value, int ni, int mi);
/* get the number of non-zero elements (in a sparse tensor) */
int GetNonzeroSize();
int GetNonzeroSize() const;
/* set the tensor as "temporary" */
void SetTMPFlag(bool myIsTmp = true);
......@@ -428,17 +431,20 @@ public:
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
void DumpFormat(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a binary file */
void BinaryDump(FILE * file);
/* read data from a file */
void Read(FILE * file, const char * label = NULL);
/* read data from a binary file */
void BinaryRead(FILE * file, size_t offset);
/* flush the data to the target device */
void FlushToMem(XMem * targetMem);
......@@ -469,7 +475,7 @@ void InitTensor(XTensor * tensor,
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
......@@ -477,7 +483,7 @@ void InitTensor1D(XTensor * tensor, const int num,
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
......@@ -485,7 +491,7 @@ void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
......@@ -493,7 +499,7 @@ void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
......@@ -501,7 +507,7 @@ void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, co
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
......@@ -509,7 +515,7 @@ void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, co
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
......@@ -529,7 +535,7 @@ XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
......@@ -538,20 +544,20 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID);
XTensor * NewTensorBufV2(const XTensor * reference, int devID, const bool isEnableGrad = true);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
......@@ -561,7 +567,7 @@ XTensor * NewTensor2D(const int rowNum, const int colNum,
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
......@@ -571,7 +577,7 @@ XTensor * NewTensor3D(const int d0, const int d1, const int d2,
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
......@@ -581,7 +587,7 @@ XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
......@@ -591,7 +597,10 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
......@@ -215,18 +215,22 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
_Div(&a, &b, &c, alpha, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if(n >= 0 && n < a.order){
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else{
ShowNTErrors("Something is wrong!");
......@@ -261,7 +265,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
......@@ -272,7 +276,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -164,10 +164,12 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
return c;
}
......@@ -193,7 +195,7 @@ void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
if (c.enableGrad == true) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -155,8 +155,10 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad) {
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
}
return c;
}
......@@ -176,7 +178,7 @@ void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
if (c.enableGrad) {
if (a.enableGrad) {
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
}
......
......@@ -296,10 +296,12 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
}
/* destroy variables */
delete[] dimSize;
......@@ -344,7 +346,7 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
......@@ -393,10 +395,12 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
/* destroy variables */
delete[] dimSize;
......@@ -440,7 +444,7 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
......
......@@ -54,15 +54,15 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
"Input tensors must have a order = 2!");
int an = a->dimSize[0], am = a->dimSize[1];
int bn = b->dimSize[0], bm = b->dimSize[1];
int cn = c->dimSize[0], cm = c->dimSize[1];
int am2 = transposedA == X_TRANS ? an : am;
int an2 = transposedA == X_TRANS ? am : an;
int bm2 = transposedB == X_TRANS ? bn : bm;
int bn2 = transposedB == X_TRANS ? bm : bn;
int cm2 = cm;
int cn2 = cn;
CheckNTErrors((am2 == bn2 && an2 == cn2 && bm2 == cm2),
"Unmatched tensors in multiplication!");
......@@ -82,10 +82,11 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
#if defined(USE_BLAS)
        _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
#else
        _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
#endif
}
else {
// TODO!!
......
......@@ -199,10 +199,7 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
bi->data = (char*)b->data + i * bRealBlockSize;
ci->data = (char*)c->data + i * cRealBlockSize;
#ifdef USE_BLAS
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
......@@ -262,10 +259,7 @@ void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
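/* note: the old runtime `useBLAS` flag is gone in both batched paths;
   the kernel choice is now fixed at build time -- with USE_BLAS defined
   the OpenBLAS-backed _MatrixMULCPU always handles the 2-D blocks,
   otherwise the _MatrixMul2D fallback is compiled in */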
......@@ -320,10 +314,12 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
}
/* destroy variables */
delete[] dimSize;
......@@ -376,10 +372,12 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -118,11 +118,87 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
}
/* tensor connections */
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
//XLink::AddParamToHead(&c, beta);
if (w.enableGrad && b.enableGrad) {
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
}
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
return c;
}
/*
operation c = x * w + b (MulAndShift)
>> x - tensor x
>> transposedA - indicates whether x is transposed
>> w - tensor w
>> transposedB - indicates whether w is transposed
>> b - tensor b
>> alpha - the scaling factor
>> parallelRunner - parallel processing module
<< return - the result of the matrix multiplication, shifted by b
*/
XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
const XTensor& w, MATRIX_TRANS_TYPE transposedB,
const XTensor& b, DTYPE alpha, XPRunner* parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have an order >= 2!");
int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, transposedA, &w, transposedB, tmp, alpha, 0, parallelRunner);
XTensor c(tmp);
c.SetTMPFlag();
int n = GetSumIndex(tmp, b);
if (n == -1) {
/* call _Sum function */
_Sum(tmp, &b, &c);
// TODO!!
ShowNTErrors("TODO!");
}
else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */
_SumDim(tmp, &b, &c, n);
}
else {
ShowNTErrors("Something is wrong!");
}
/* tensor connections */
if (w.enableGrad && b.enableGrad) {
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
}
/* destroy variables */
delete[] dimSize;
......
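A hedged usage sketch of the new transposed overload (a hypothetical helper; shapes are illustrative assumptions):

void MulAndShiftSketch()
{
    XTensor x;
    XTensor w;
    XTensor b;
    InitTensor2D(&x, 8, 16);
    InitTensor2D(&w, 32, 16);   /* weight stored transposed */
    InitTensor1D(&b, 32);
    /* y = x * w^T + b */
    XTensor y = MulAndShift(x, X_NOTRANS, w, X_TRANS, b);
}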
......@@ -31,6 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MulAndShift(const XTensor &x, MATRIX_TRANS_TYPE transposedA,
const XTensor &w, MATRIX_TRANS_TYPE transposedB,
const XTensor &b, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -216,18 +216,22 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
_Multiply(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if(n >= 0 && n < a.order){
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else{
ShowNTErrors("Something is wrong!");
......@@ -262,7 +266,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
......@@ -273,7 +277,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -180,9 +180,11 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
_MultiplyDim(&a, &b, &c, n, 0);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
}
return c;
}
......@@ -208,7 +210,7 @@ void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -350,8 +352,10 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
_MultiplyBroadcast(&a, &b, &c, 0);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
}
return c;
}
......@@ -374,7 +378,7 @@ void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
/* call _SumBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
......
......@@ -190,17 +190,21 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
_Sub(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
}
else if(n >= 0 && n < a.order){
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else{
ShowNTErrors("Something is wrong!");
......@@ -229,7 +233,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
......@@ -239,7 +243,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -164,9 +164,11 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
......@@ -193,7 +195,7 @@ void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../../XBLAS.h"
#include "../movement/CopyValues.h"
#include "Sum.h"
#include "Sum.cuh"
......@@ -84,29 +85,57 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data;
/* when c != a, OpenBLAS needs to copy a to c first, which slows things
   down, so we only use OpenBLAS when c == a */
#if defined(USE_BLAS)
if (c == a) {
    AXPY(a->unitNum, beta, bp, 1, cp, 1);
}
else {
    int num = a->unitNum;
    if (num % 4 == 0) {
        for (int i = 0; i < num; i += 4) {
            cp[i] = ap[i] + bp[i] * beta;
            cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
            cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
            cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
        }
    }
    else if (num % 2 == 0) {
        for (int i = 0; i < num; i += 2) {
            cp[i] = ap[i] + bp[i] * beta;
            cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
        }
    }
    else {
        for (int i = 0; i < num; i++) {
            cp[i] = ap[i] + bp[i] * beta;
        }
    }
}
#else
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
    for (int i = 0; i < num; i += 4) {
        cp[i] = ap[i] + bp[i] * beta;
        cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
        cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
        cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
    }
}
else if (num % 2 == 0) {
    for (int i = 0; i < num; i += 2) {
        cp[i] = ap[i] + bp[i] * beta;
        cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
    }
}
else {
    for (int i = 0; i < num; i++) {
        cp[i] = ap[i] + bp[i] * beta;
    }
}
#endif
}
else {
// TODO!!
ShowNTErrors("TODO!");
......@@ -195,17 +224,21 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
_Sum(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if(n >= 0 && n < a.order){
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else{
ShowNTErrors("Something is wrong!");
......@@ -232,9 +265,9 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (c.enableGrad) {
/* tensor connections */
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
......@@ -242,9 +275,9 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (c.enableGrad) {
/* tensor connections */
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
......
......@@ -181,9 +181,11 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
......@@ -210,7 +212,7 @@ void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -353,9 +355,11 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
_SumBroadcast(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
}
return c;
}
......@@ -377,7 +381,7 @@ void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (c.enableGrad) {
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
......
......@@ -121,7 +121,8 @@ XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
_ConvertDataType(&input, &output);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
if(input.enableGrad)
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
return output;
}
......@@ -136,7 +137,7 @@ void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE d
_ConvertDataType(&input, &output);
/* tensor connection */
if (output.enableGrad)
if (input.enableGrad)
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
}
......
......@@ -32,65 +32,43 @@ convert onehot tensor to index tensor
>> index - index tensor, whose values are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(const XTensor * onehot, XTensor * index, int size)
{
    CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
    CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
    CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")

    for (int i = 0; i < index->order; i++)
        CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");

#ifdef USE_CUDA
    if(onehot->devID >= 0 && index->devID >= 0) {
        _CudaOnehotToIndex(onehot, index, size);
        return;
    }
#endif

    int blockNum = index->unitNum;
    int stride = size;

    int * onehotData = (int *)onehot->data;
    int * indexData = (int *)index->data;

    for (int i = 0; i < blockNum; i++) {
        int * od = onehotData + i * stride;
        int record = -1;
        for (int j = 0; j < stride; j++) {
            if (od[j] != 0) {
                if (record == -1)
                    record = j;
                else
                    ShowNTErrors("The value of the onehot tensor is illegal!");
            }
        }
        indexData[i] = record;
    }
}
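
A small usage sketch (a hypothetical helper; InitTensor2D is assumed to accept a data-type argument as in the usual NiuTrans.Tensor API):

void OnehotToIndexSketch()
{
    XTensor onehot;
    InitTensor2D(&onehot, 2, 3, X_INT);   /* rows: {0,1,0}, {1,0,0} */
    /* ... fill onehot ... */
    XTensor index = OnehotToIndex(onehot, 3);   /* values: {1, 0} */
}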
/*
......@@ -101,7 +79,7 @@ make a new tensor to keep the result and return it
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
*/
XTensor OnehotToIndex(XTensor & onehot, int size)
XTensor OnehotToIndex(const XTensor & onehot, int size)
{
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
......@@ -123,10 +101,9 @@ convert index tensor to onehot tensor
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
                    int size, float labelSmoothingP)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
//CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
......@@ -171,7 +148,7 @@ make a new tensor to keep the result and return it
>> labelSmoothingP - the label smoothing factor
<< return - the onehot tensor
*/
XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
XTensor IndexToOnehot(const XTensor & index, int size, float labelSmoothingP)
{
CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
......@@ -184,11 +161,11 @@ XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, labelSmoothingP);
_IndexToOnehot(&index, &onehot, size, labelSmoothingP);
delete[] dim;
return onehot;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -61,7 +61,7 @@ convert onehot tensor to index tensor (cuda version)
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
void _CudaOnehotToIndex(const XTensor * onehot, XTensor * index, int size)
{
int devID = onehot->devID;
......@@ -153,4 +153,4 @@ void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......@@ -27,10 +27,11 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
void _CudaOnehotToIndex(const XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,19 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim);
void _OnehotToIndex(const XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int size);
XTensor OnehotToIndex(const XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
float labelSmoothingP = 0.0F);
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP = 0.0F);
XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -117,10 +117,12 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
_SelectRange(&a, &c, dim, low, high);
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -526,6 +526,43 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
}
}
/* generate data items in the range [lower, upper) with a fixed step
>> tensor - the tensor whose data array would be initialized
>> lower - the beginning of the range
>> upper - the end of the range (not included)
>> step - the step between two adjacent items
*/
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
{
CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");
/* compute the true length according to the (start, end, step) */
DTYPE size = fabs(upper - lower);
int num = ceil(size / fabs(step));
CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched.");
/* init a integer array to store the sequence */
void * data = NULL;
if (tensor->dataType == X_INT) {
data = new int[num];
for (int i = 0; i < num; i++)
*((int*)data + i) = lower + i * step;
}
else if (tensor->dataType == X_FLOAT) {
data = new float[num];
for (int i = 0; i < num; i++)
*((float*)data + i) = lower + i * step;
}
else {
ShowNTErrors("TODO!");
}
/* set the data from the array */
tensor->SetData(data, num);
delete[] data;
}
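
A minimal usage sketch of the new initializer (a hypothetical helper; InitTensor1D is assumed to accept a data-type argument, and unitNum must equal ceil(|upper - lower| / |step|)):

void SetDataRangeSketch()
{
    XTensor r;
    InitTensor1D(&r, 5, X_INT);
    _SetDataRange(&r, 0, 10, 2);   /* r = {0, 2, 4, 6, 8} */
}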
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
......
......@@ -69,6 +69,9 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a range by start, end and the step */
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
......
......@@ -167,7 +167,9 @@ XTensor funcName(const XTensor &a, T num)
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \
if(a.enableGrad){ \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
XLink::AddParamToHead(&b, num); \
return b; \
} \
......@@ -183,7 +185,7 @@ void funcName(const XTensor &a, XTensor &b, T num)
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (b.enableGrad) { \
if (a.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
} \
......
......@@ -36,26 +36,26 @@ set every entry to its clip value
void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
    _CudaClip(a, b, lower, upper);
    return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
/*
......@@ -99,9 +99,11 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
_Clip(&a, &b, lower, upper);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
}
return b;
}
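
A short usage sketch (a hypothetical helper):

void ClipSketch(const XTensor & x)
{
    /* clamp every entry into [-1, 1]; a MATH_CLIP link is built
       only when x.enableGrad is true */
    XTensor h = Clip(x, (DTYPE)-1.0, (DTYPE)1.0);
}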
......@@ -115,8 +117,8 @@ void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper)
/* call _Clip function */
_Clip(&a, &b, lower, upper);
if (b.enableGrad) {
/* tensor connections */
/* tensor connections */
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
......
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "Compare.h"
#include "Compare.cuh"
......@@ -123,4 +124,95 @@ SIMPLE_COMPARE_FUNCTION_ME(NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
/* define three marco separately, specify the respective function names */
#ifdef USE_CUDA
#define _SIMPLE_MAX_MIN_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, const XTensor * b, XTensor * c) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b, c)), \
"Input and output tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
CheckDev(a->devID, b->devID); \
CheckDev(a->devID, c->devID); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, c); \
return; \
} \
DTYPE * da = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
DTYPE * dc = (DTYPE*)c->data; \
for (int i = 0; i < a->unitNum; i++) \
dc[i] = (DTYPE)origFunc(da[i], db[i]); \
}
#else
#define _SIMPLE_MAX_MIN_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, const XTensor * b, XTensor *c) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b, c)), \
"Input and output tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
CheckDev(a->devID, b->devID);                                          \
CheckDev(a->devID, c->devID);                                          \
/* GPU tensors cannot be handled in a build without CUDA */           \
if (a->devID >= 0) {                                                   \
    ShowNTErrors("No GPU device support!")                             \
}                                                                      \
DTYPE * da = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
DTYPE * dc = (DTYPE*)c->data; \
for (int i = 0; i < a->unitNum; i++) \
dc[i] = (DTYPE)origFunc(da[i], db[i]); \
}
#endif
#define _SIMPLE_MAX_MIN_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, const XTensor * b) \
{ \
_funcName(a, b, a); \
}
#define SIMPLE_MAX_MIN_FUNCTION_ME(funcNameMe, _funcName) \
void funcNameMe(XTensor & a, const XTensor & b) \
{ \
_funcName(&a, &b, &a); \
}
#define SIMPLE_MAX_MIN_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor & a, const XTensor & b) \
{ \
XTensor c(&a); \
c.SetTMPFlag(); \
_funcName(&a, &b, &c); \
return c; \
}
#define SIMPLE_MAX_MIN_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, const XTensor &b, XTensor &c) \
{ \
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { \
InitTensor(&c, &a); \
} \
_funcName(&a, &b, &c); \
}
#ifdef USE_CUDA
_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, max)
_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, min)
#else
_SIMPLE_MAX_MIN_FUNCTION(_Max, max)
_SIMPLE_MAX_MIN_FUNCTION(_Min, min)
#endif
_SIMPLE_MAX_MIN_FUNCTION_ME(_MaxMe, _Max)
SIMPLE_MAX_MIN_FUNCTION_ME(MaxMe, _Max)
SIMPLE_MAX_MIN_FUNCTION(Max, _Max, MATH_MAX)
SIMPLE_MAX_MIN_FUNCTION_VOID(Max, _Max, MATH_MAX)
_SIMPLE_MAX_MIN_FUNCTION_ME(_MinMe, _Min)
SIMPLE_MAX_MIN_FUNCTION_ME(MinMe, _Min)
SIMPLE_MAX_MIN_FUNCTION(Min, _Min, MATH_MIN)
SIMPLE_MAX_MIN_FUNCTION_VOID(Min, _Min, MATH_MIN)
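
A short usage sketch of the new functions (a hypothetical helper; InitTensor1D and SetZeroAll are assumed to follow their usual signatures):

void MaxMinSketch()
{
    XTensor a;
    XTensor b;
    InitTensor1D(&a, 4);
    InitTensor1D(&b, 4);
    a.SetZeroAll();
    b.SetZeroAll();
    XTensor c = Max(a, b);   /* item-wise maximum */
    MinMe(a, b);             /* in-place: a = min(a, b) */
}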
} // namespace nts(NiuTrans.Tensor)
......@@ -89,6 +89,53 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, DTYPE number) \
SIMPLE_COMPARE_FUNCTION_GPU(Equal, cudaIsEqual)
SIMPLE_COMPARE_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
#define SIMPLE_MAX_MIN_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, DTYPE * c, int size) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
c[i] = (DTYPE)origFunc(a[i], b[i]); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, __half * c, int size) \
{ \
return; \
} \
void _Cuda##funcName(const XTensor * a, const XTensor * b, XTensor * c) \
{ \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, \
(DTYPE*)c->data, a->unitNum); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, \
(__half*)c->data, a->unitNum); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
}
SIMPLE_MAX_MIN_FUNCTION_GPU(Max, max)
SIMPLE_MAX_MIN_FUNCTION_GPU(Min, min)
#endif // USE_CUDA
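/* two observations on the kernel above (ours, not part of the diff):
   GDevs.GetCudaThread derives a 1-D launch shape covering a->unitNum
   elements, one element per thread; and the __half overload of
   Kernel##funcName is a stub, so FP16 max/min currently does nothing */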
......
......@@ -34,6 +34,12 @@ void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is not equal to the given value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);
/* return the maximum of two tensors for each item (cuda version) */
void _CudaMax(const XTensor * a, const XTensor * b, XTensor * c);
/* return the minimum of two tensors for each item (cuda version) */
void _CudaMin(const XTensor * a, const XTensor * b, XTensor * c);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -56,6 +56,36 @@ XTensor NotEqual(const XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value */
void NotEqual(const XTensor & a, XTensor & b, DTYPE value);
/* return the maximum of two tensors for each item */
void _Max(const XTensor * a, const XTensor * b, XTensor * c);
/* return the maximum of two tensors for each item (do it on site) */
void _MaxMe(XTensor * a, const XTensor * b);
/* return the maximum of two tensors for each item (do it on site) */
void MaxMe(XTensor & a, const XTensor & b);
/* return the maximum of two tensors for each item (return an XTensor structure) */
XTensor Max(const XTensor & a, const XTensor & b);
/* return the maximum of two tensors for each item */
void Max(const XTensor & a, const XTensor & b, XTensor & c);
/* return the minimum of two tensors for each item */
void _Min(const XTensor * a, const XTensor * b, XTensor * c);
/* return the minimum of two tensors for each item (do it on site) */
void _MinMe(XTensor * a, const XTensor * b);
/* return the minimum of two tensors for each item (do it on site) */
void MinMe(XTensor & a, const XTensor & b);
/* return the minimum of two tensors for each item (return an XTensor structure) */
XTensor Min(const XTensor & a, const XTensor & b);
/* return the minimum of two tensors for each item */
void Min(const XTensor & a, const XTensor & b, XTensor & c);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
......@@ -46,7 +46,7 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
......@@ -173,9 +173,11 @@ XTensor Normalize(const XTensor &input, int dim,
list.Add((XTensor*)&var);
list.Add((XTensor*)&a);
list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
if (input.enableGrad) {
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
}
return output;
}
......@@ -208,7 +210,7 @@ void Normalize(const XTensor &input, XTensor &output, int dim,
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
if (output.enableGrad == true) {
if (input.enableGrad == true) {
/* tensor connections */
TensorList list(5);
list.Add((XTensor*)&input);
......
......@@ -126,9 +126,11 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
_ScaleAndShift(&a, &b, scale, shift);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
XLink::AddParamToHead(&b, shift);
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
XLink::AddParamToHead(&b, shift);
}
return b;
}
......@@ -152,7 +154,7 @@ void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift)
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
if (b.enableGrad) {
if (a.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
......
......@@ -151,7 +151,9 @@ XTensor funcName(const XTensor & a)
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
if(a.enableGrad){ \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
return b; \
}
......@@ -162,7 +164,7 @@ void funcName(const XTensor & a, XTensor & b)
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
if (a.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
......
......@@ -258,10 +258,12 @@ XTensor CopyIndexed(const XTensor & s, int dim,
list.Add((XTensor*)&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
if (s.enableGrad) {
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadInt(&t, copyNum);
}
/* destroy variables */
delete[] dimSize;
......@@ -314,13 +316,15 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
memcpy(saveTgtIndex, tgtIndex, indexSize * sizeof(int));
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadPointer(&t, saveSrcIndex);
XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, saveTgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
if (s.enableGrad) {
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadPointer(&t, saveSrcIndex);
XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, saveTgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -134,7 +134,9 @@ XTensor CopyValues(const XTensor &s, XStream * stream)
_CopyValues(&s, &t, stream);
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYVALUES);
if (s.enableGrad) {
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYVALUES);
}
return t;
}
......
......@@ -93,7 +93,9 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather(&s, &t, &index);
/* tensor connection */
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
if (s.enableGrad) {
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
}
return t;
}
......
......@@ -21,6 +21,8 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XBLAS.h"
#include "VectorBuffer.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
......@@ -41,8 +43,8 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
......@@ -76,18 +78,75 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
}
blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
    int vecBufLength = 32 / sizeof(DTYPE);
    if(dimRDI == 0){
        //data is contiguous in dim 0
        for(int i = 0; i < blockNum; i++){
            DTYPE * ip = (DTYPE*)input->data + blockSize * i;
            DTYPE * op = (DTYPE*)output->data + i;
            VectorBuffer vecBuf[4];
            for(int j = 0; j < 4; j++){
                vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength);
            }
            for(int j = 1; j < strideNum / 32; j++){
                const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
                vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
                vecBuf[1] = vecBuf[1].maxData(VectorBuffer::loadu(ptr + 1 * vecBufLength));
                vecBuf[2] = vecBuf[2].maxData(VectorBuffer::loadu(ptr + 2 * vecBufLength));
                vecBuf[3] = vecBuf[3].maxData(VectorBuffer::loadu(ptr + 3 * vecBufLength));
            }
            vecBuf[0] = vecBuf[0].maxData(vecBuf[1]);
            vecBuf[0] = vecBuf[0].maxData(vecBuf[2]);
            vecBuf[0] = vecBuf[0].maxData(vecBuf[3]);
            DTYPE maxN = DTYPE_MIN;
            for(int k = 0; k < vecBufLength; k++){
                maxN = MAX(maxN, vecBuf[0][k]);
            }
            *op = maxN;
        }
    } else{
        //data is separated
        for(int i = 0; i < blockNum; i++){
            for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
                DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                DTYPE * op = (DTYPE*)output->data + stride * i;
                VectorBuffer vecBuf[4];
                for(int k = 0; k < 4; k++){
                    vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE));
                }
                for(int k = 1; k < strideNum; k++){
                    DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
                    vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
                    vecBuf[1] = vecBuf[1].maxData(VectorBuffer::loadu(ptr + 1 * vecBufLength));
                    vecBuf[2] = vecBuf[2].maxData(VectorBuffer::loadu(ptr + 2 * vecBufLength));
                    vecBuf[3] = vecBuf[3].maxData(VectorBuffer::loadu(ptr + 3 * vecBufLength));
                }
                for(int k = 0; k < 4; k++){
                    for(int l = 0; l < vecBufLength; l++)
                        *(op + j * 32 + 8 * k + l) = vecBuf[k][l];
                }
            }
        }
    }
}//run vector buffer
else{
    for(int k = 0; k < blockNum; k++){
        DTYPE * ip = (DTYPE*)input->data + blockSize * k;
        DTYPE * op = (DTYPE*)output->data + stride * k;
        for(int i = 0; i < stride; i++){
            DTYPE max = DTYPE_MIN;
            DTYPE * ipe = ip + blockSize;
            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                DTYPE v = *ipb;
                if(max < v)
                    max = v;
            }
            *(op + i) = max;
        }
    }
}
}
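/* a brief note on the guard above (an inference about intent, not stated
   in the diff): each VectorBuffer spans 32 bytes (8 floats), and four
   buffers are kept live, so one pass consumes 4 * 32 bytes; the
   `% (4 * 32 / sizeof(DTYPE)) == 0` and `>= 32` conditions ensure the
   innermost dimension divides evenly into such passes */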
......@@ -104,7 +163,7 @@ make a new tensor to keep the result and return it
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -122,8 +181,10 @@ XTensor ReduceMax(const XTensor &input, int dim)
_ReduceMax(&input, &output, dim);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
if (input.enableGrad) {
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -162,7 +223,7 @@ void ReduceMax(const XTensor &input, XTensor &output, int dim)
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
if (output.enableGrad) {
if (input.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -39,7 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
_ReduceSum(input, output, dim);
......@@ -59,7 +59,7 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
XTensor ReduceMean(const XTensor &input, int dim)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -77,8 +77,10 @@ XTensor ReduceMean(const XTensor &input, int dim)
_ReduceMean(&input, &output, dim);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim);
if (input.enableGrad) {
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -119,7 +121,7 @@ void ReduceMean(const XTensor &input, XTensor &output, int dim)
/* call _ReduceMean function */
_ReduceMean(&input, &output, dim);
if (output.enableGrad) {
if (input.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -23,6 +23,9 @@
#include "ReduceSum.h"
#include "ReduceSum.cuh"
#include "../../XName.h"
#include "../../XBLAS.h"
#include "VectorBuffer.h"
#include <iostream>
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -50,7 +53,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
......@@ -82,118 +85,188 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
    int vecBufLength = 32 / sizeof(DTYPE);
    if(dimRDI == 0){
        //data is contiguous in dim 0
        for(int i = 0; i < blockNum; i++){
            // stride = 1
            DTYPE * ip = (DTYPE*)input->data + blockSize * i;
            DTYPE * op = (DTYPE*)output->data + i;
            DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + i : NULL;
            DTYPE bias[32 / sizeof(DTYPE)] = {0};
            if(shift != NULL){
                for(int k = 0; k < 32 / sizeof(DTYPE); k++)
                    bias[k] = *(sp);
            }
            VectorBuffer vecBuf[4];
            for(int j = 0; j < 4; j++){
                vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
            }
            for(int j = 1; j < strideNum / 32; j++){
                const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
                vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
                vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
                vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
                vecBuf[3] = vecBuf[3] + VectorBuffer::loadu(ptr + 3 * vecBufLength, isExp, power, bias);
            }
            vecBuf[0] = ((vecBuf[0] + vecBuf[1]) + (vecBuf[2] + vecBuf[3]));
            DTYPE sum = (DTYPE) 0.0;
            for(int k = 0; k < vecBufLength; k++){
                sum = sum + vecBuf[0][k];
            }
            *op = sum;
        }
    } else{
        //data is separated
        for(int i = 0; i < blockNum; i++){
            for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
                DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                DTYPE * op = (DTYPE*)output->data + stride * i;
                DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
                DTYPE bias[4 * 32 / sizeof(DTYPE)] = {0};
                if(shift != NULL){
                    for(int k = 0; k < 4 * 32 / sizeof(DTYPE); k++)
                        bias[k] = *(sp + k);
                }
                VectorBuffer vecBuf[4];
                for(int k = 0; k < 4; k++){
                    vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE));
                }
                for(int k = 1; k < strideNum; k++){
                    DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
                    vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
                    vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias + 1 * vecBufLength);
                    vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias + 2 * vecBufLength);
                    vecBuf[3] = vecBuf[3] + VectorBuffer::loadu(ptr + 3 * vecBufLength, isExp, power, bias + 3 * vecBufLength);
                }
                for(int k = 0; k < 4; k++){
                    for(int l = 0; l < vecBufLength; l++)
                        *(op + j * 32 + 8 * k + l) = vecBuf[k][l];
                }
            }
        }
    }
}//run vector buffer
else{
    for(int k = 0; k < blockNum; k++){
        DTYPE * ip = (DTYPE*)input->data + blockSize * k;
        DTYPE * op = (DTYPE*)output->data + stride * k;
        DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * k : NULL;
        for(int i = 0; i < stride; i++){
            DTYPE sum = 0;
            DTYPE bias = shift != NULL ? *(sp + i) : 0;
            DTYPE * ipe = ip + blockSize;
            if(isExp){
                if(bias == 0){
                    if(power == (DTYPE)1.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                            sum += (DTYPE)exp(*ipb);
                    }
                    else if(power == (DTYPE)2.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb);
                            sum += (DTYPE)exp(value * value);
                        }
                    }
                    else if(power == (DTYPE)0.5){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb);
                            sum += (DTYPE)exp(sqrt(value));
                        }
                    }
                    else{
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb);
                            sum += (DTYPE)exp(pow(value, power));
                        }
                    }
                }
                else{
                    if(power == (DTYPE)1.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                            sum += (DTYPE)exp(*ipb - bias);
                    }
                    else if(power == (DTYPE)2.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb) - bias;
                            sum += (DTYPE)exp(value * value);
                        }
                    }
                    else if(power == (DTYPE)0.5){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb) - bias;
                            sum += (DTYPE)exp(sqrt(value));
                        }
                    }
                    else{
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb) - bias;
                            sum += (DTYPE)exp(pow(value, power));
                        }
                    }
                }
            }
            else{
                if(bias == 0){
                    if(power == (DTYPE)1.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                            sum += *ipb;
                    }
                    else if(power == (DTYPE)2.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb);
                            sum += value * value;
                        }
                    }
                    else if(power == (DTYPE)0.5){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb);
                            sum += (DTYPE)sqrt(value);
                        }
                    }
                    else{
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb);
                            sum += (DTYPE)pow(value, power);
                        }
                    }
                }
                else{
                    if(power == (DTYPE)1.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                            sum += *ipb;
                        sum -= strideNum * bias;
                    }
                    else if(power == (DTYPE)2.0){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb) - bias;
                            sum += value * value;
                        }
                    }
                    else if(power == (DTYPE)0.5){
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb) - bias;
                            sum += (DTYPE)sqrt(value);
                        }
                    }
                    else{
                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                            DTYPE value = (*ipb) - bias;
                            sum += (DTYPE)pow(value, power);
                        }
                    }
                }
            }
            *(op + i) = sum;
        }
    }
}
}
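
A usage sketch tying ReduceMax and ReduceSum together (a numerically stable softmax denominator; a hypothetical helper, shapes assumed):

void SoftmaxDenomSketch(const XTensor & x)
{
    /* z_i = sum_j exp(x_ij - max_j x_ij), reducing over dimension 1 */
    XTensor mx = ReduceMax(x, 1);
    XTensor z = ReduceSum(x, 1, mx, (DTYPE)1.0, true);
}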
......@@ -215,7 +288,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -233,10 +306,12 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
_ReduceSum(&input, &output, dim, &shift, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
if (input.enableGrad) {
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
}
/* destroy variables */
delete[] dimSize;
......@@ -268,7 +343,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &sh
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
if (output.enableGrad) {
if (input.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
......@@ -294,7 +369,7 @@ sum = \sum_i exp((a_i)^power) if isExp == true
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -312,10 +387,12 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
_ReduceSum(&input, &output, dim, NULL, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
if (input.enableGrad) {
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
}
/* destroy variables */
delete[] dimSize;
......@@ -361,7 +438,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
if (output.enableGrad) {
if (input.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -55,7 +55,7 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -73,8 +73,10 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
_ReduceSumSquared(&input, &output, dim, &shift);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
if (input.enableGrad) {
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -116,7 +118,7 @@ void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTen
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
if (output.enableGrad) {
if (input.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -38,7 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{
int dimRDI = input->order - dim - 1;
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
_ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
......@@ -58,7 +58,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
......@@ -76,8 +76,10 @@ XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
_ReduceVariance(&input, &output, dim, &mean);
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
if (input.enableGrad) {
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -119,7 +121,7 @@ void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTenso
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
if (output.enableGrad) {
if (input.enableGrad) {
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
......
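For reference, _ReduceVariance composes the two calls shown above: a power-2 reduction about the mean followed by a 1/n scale, i.e. variance = (1/n) * \sum_i (a_i - mean)^2. For example, reducing a = {1, 2, 3} with mean = 2 along a dimension of size n = 3 gives (1-2)^2 + (2-2)^2 + (3-2)^2 = 2, and scaling by 1/3 yields a variance of roughly 0.667.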
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: ZHANG Yuhao (email: zhangyuhao@stu.neu.edu.cn) 2019-07-23
*/
#include "VectorBuffer.h"
namespace nts {
/* number of DTYPE elements held by each buffer */
int VectorBuffer::size()
{
return 32 / sizeof(DTYPE);
}
/* constructor */
VectorBuffer::VectorBuffer()
{
}
/*
constructor
initialize all values with val
*/
VectorBuffer::VectorBuffer(DTYPE val)
{
for (int i = 0; i != size(); i++) {
values[i] = val;
}
}
/* load data */
VectorBuffer VectorBuffer::loadu(const DTYPE* ptr, bool isExp, DTYPE power, DTYPE* bias)
{
int count = 32 / sizeof(DTYPE);
VectorBuffer vec;
if (isExp) {
if (bias == NULL) {
if (power == (DTYPE)1.0) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp(*(ptr + i));
}
}
else if (power == (DTYPE)2.0) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp((*(ptr + i)) * (*(ptr + i)));
}
}
else if (power == (DTYPE)0.5) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp(sqrt(*(ptr + i)));
}
}
else {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp(pow(*(ptr + i), power));
}
}
} /* end of bias == NULL branch */
else {
if (power == (DTYPE)1.0) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp(*(ptr + i) - bias[i]);
}
}
else if (power == (DTYPE)2.0) {
for (int i = 0; i != count; i++) {
DTYPE value = *(ptr + i) - bias[i];
vec.values[i] = (DTYPE)exp(value * value);
}
}
else if (power == (DTYPE)0.5) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp(sqrt(*(ptr + i) - bias[i]));
}
}
else {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)exp(pow(*(ptr + i) - bias[i], power));
}
}
}
} /* end of isExp branch */
else {
if (bias == NULL) {
if (power == (DTYPE)1.0) {
memcpy(vec.values, ptr, count * sizeof(DTYPE));
}
else if (power == (DTYPE)2.0) {
for (int i = 0; i != count; i++) {
vec.values[i] = (*(ptr + i)) * (*(ptr + i));
}
}
else if (power == (DTYPE)0.5) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)sqrt(*(ptr + i));
}
}
else {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)pow(*(ptr + i), power);
}
}
} /* end of bias == NULL branch */
else {
if (power == (DTYPE)1.0) {
for (int i = 0; i != count; i++) {
vec.values[i] = *(ptr + i) - bias[i];
}
}
else if (power == (DTYPE)2.0) {
for (int i = 0; i != count; i++) {
DTYPE value = *(ptr + i) - bias[i];
vec.values[i] = value * value;
}
}
else if (power == (DTYPE)0.5) {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)sqrt(*(ptr + i) - bias[i]);
}
}
else {
for (int i = 0; i != count; i++) {
vec.values[i] = (DTYPE)pow(*(ptr + i) - bias[i], power);
}
}
}
}
return vec;
}
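To summarize the branches above: loadu fills each lane with x, x^2, sqrt(x) or pow(x, power) (fast paths exist for powers 1.0, 2.0 and 0.5), subtracting bias[i] first when a bias pointer is supplied, and wrapping the result in exp() when isExp is true. For instance, loadu(p, true, 2.0F, b) yields exp((p[i] - b[i])^2) in lane i.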
/* overloading [] */
const DTYPE& VectorBuffer::operator[](int idx)const
{
return values[idx];
}
/* overloading + */
VectorBuffer VectorBuffer::operator+(const VectorBuffer &a)
{
for (int i = 0; i != a.size(); i++) {
this->values[i] = a[i] + this->values[i];
}
return *this;
}
/* compute the element-wise max of two buffers */
VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
for (int i = 0; i != a.size(); i++) {
this->values[i] = MAX(a[i], this->values[i]);
}
return *this;
}
}/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
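Note on the buffer width: size() returns 32 / sizeof(DTYPE), i.e. 8 lanes when DTYPE is a 4-byte float and 4 lanes when it is an 8-byte double. The 32-byte width presumably mirrors a 256-bit SIMD register, although this implementation emulates the lanes with plain scalar loops.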
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: ZHANG Yuhao (email: zhangyuhao@stu.neu.edu.cn) 2019-07-23
*/
//#include <cstring>
#include <math.h>
#include "../../XGlobal.h"
namespace nts {
class VectorBuffer {
private:
/* buffer for computation */
DTYPE values[32 / sizeof(DTYPE)] = { 0 };
public:
/* number of DTYPE elements held by each buffer */
static int size();
/* constructor */
VectorBuffer();
/* constructor with an initial value */
VectorBuffer(DTYPE val);
/* load data */
static VectorBuffer loadu(const DTYPE* ptr, bool isExp = false, DTYPE power = (DTYPE)1.0F, DTYPE* bias = NULL);
/* overloading [] */
const DTYPE& operator[](int idx)const;
/* overloading + */
VectorBuffer operator+(const VectorBuffer &a);
/* compute the element-wise max of two buffers */
VectorBuffer maxData(const VectorBuffer &a);
};
}
\ No newline at end of file
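A minimal usage sketch for the class declared above; the accumulation loop and the SumWithVectorBuffer name are illustrative, not part of this commit:

#include "VectorBuffer.h"
using namespace nts;

/* sum a DTYPE array in chunks of VectorBuffer::size() lanes,
   then fold the lanes and the scalar tail into one value */
DTYPE SumWithVectorBuffer(const DTYPE * data, int n)
{
    int lanes = VectorBuffer::size();      /* 8 for a 4-byte DTYPE */
    VectorBuffer acc((DTYPE)0.0);          /* every lane starts at 0 */
    int i = 0;
    for (; i + lanes <= n; i += lanes)
        acc = acc + VectorBuffer::loadu(data + i);
    DTYPE sum = (DTYPE)0.0;
    for (int l = 0; l < lanes; l++)        /* horizontal fold */
        sum += acc[l];
    for (; i < n; i++)                     /* scalar tail */
        sum += data[i];
    return sum;
}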
......@@ -99,9 +99,11 @@ XTensor Concatenate(const TensorList &smalls, int dim)
_Merge(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
if (tensor->enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -127,8 +129,10 @@ XTensor Concatenate(const TensorList &smalls, int dim)
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
if (tensor->enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -309,9 +313,11 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
_Merge(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
if (tensor->enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
}
/* destroy variables */
delete[] dimSize;
......@@ -337,8 +343,10 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
if (tensor->enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -222,9 +222,11 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
_Merge(&s, &t, whereToMerge, leadingDim);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge);
XLink::AddParamToHeadInt(&t, leadingDim);
if (s.enableGrad) {
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge);
XLink::AddParamToHeadInt(&t, leadingDim);
}
/* destroy variables */
delete[] dimSize;
......@@ -261,7 +263,7 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim)
/* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim);
if (t.enableGrad) {
if (s.enableGrad) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge);
......@@ -412,8 +414,10 @@ XTensor Merge(const TensorList &smalls, int whereToMerge)
_Merge(&smalls, &big, whereToMerge);
/* tensor connections */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
if (tensor->enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
}
/* destroy variables */
delete[] dimSize;
......@@ -453,8 +457,10 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
_Merge(&smalls, &big, whereToMerge);
/* tensor connections */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
if (smallA.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -43,9 +43,11 @@ XTensor Reshape(XTensor &s, int order, int * dimSize)
t.Reshape(order, dimSize);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
if (s.enableGrad) {
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
}
return t;
return t;
}
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize)
......@@ -57,7 +59,7 @@ void Reshape(XTensor &s, XTensor &t, int order, int * dimSize)
/* call Reshape function */
t.Reshape(order, dimSize);
if (t.enableGrad) {
if (s.enableGrad) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
}
......
......@@ -217,9 +217,11 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
_Split(&s, &t, whereToSplit, splitNum);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
XLink::AddParamToHeadInt(&t, splitNum);
if (s.enableGrad) {
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
XLink::AddParamToHeadInt(&t, splitNum);
}
/* destroy variables */
delete[] dimSize;
......@@ -251,7 +253,7 @@ void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum)
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
if (t.enableGrad) {
if (s.enableGrad) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
......@@ -409,12 +411,15 @@ void Split(const XTensor &big, TensorList &smalls, int whereToSplit, int splitNu
/* tensor connections */
for(int i = 0; i < smalls.count; i++){
XTensor * s = (XTensor*)smalls.Get(i);
XLink::MakeLink(&big, NULL, s, SHAPE_SPLIT_LIST);
XLink::AddParamToHeadInt(s, whereToSplit);
/* it is tricky here that we keep the id of each
block, rather than the total number of the splits */
XLink::AddParamToHeadInt(s, i);
if (s->enableGrad) {
XLink::MakeLink(&big, NULL, s, SHAPE_SPLIT_LIST);
XLink::AddParamToHeadInt(s, whereToSplit);
/* it is tricky here that we keep the id of each
block, rather than the total number of the splits */
XLink::AddParamToHeadInt(s, i);
}
}
}
......
......@@ -121,7 +121,9 @@ XTensor Squeeze(XTensor & source, int leadingDim)
_Squeeze(&source, &target, leadingDim);
/* tensor connections */
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
if (source.enableGrad) {
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
}
return target;
}
......@@ -135,7 +137,7 @@ void Squeeze(XTensor & source, XTensor & target, int leadingDim)
/* call _Squeeze function */
_Squeeze(&source, &target, leadingDim);
if (target.enableGrad) {
if (source.enableGrad) {
/* tensor connections */
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
}
......
......@@ -144,9 +144,11 @@ XTensor Transpose(const XTensor &a, const int i, const int j)
_Transpose(&a, &b, i, j);
/* tensor connection */
XLink::MakeLink(&a, NULL, &b, SHAPE_TRANSPOSE);
XLink::AddParamToHeadInt(&b, i);
XLink::AddParamToHeadInt(&b, j);
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &b, SHAPE_TRANSPOSE);
XLink::AddParamToHeadInt(&b, i);
XLink::AddParamToHeadInt(&b, j);
}
/* destroy variables */
delete[] dimSize;
......
......@@ -156,9 +156,11 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
_Unsqueeze(&a, &b, dim, dSize);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&b, dSize);
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&b, dSize);
}
/* destroy variables */
delete[] dimSize;
......@@ -191,7 +193,7 @@ void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize)
/* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize);
if (b.enableGrad) {
if (a.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim);
......
......@@ -377,8 +377,8 @@ get the top-k items
template<class T> __global__
void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T minValue, T * output, int * index)
{
__shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 1024 * sizeof(T)) / sizeof(CudaHeapNode<T>)];
__shared__ T eachHeapMaxValue[1024];
__shared__ CudaHeapNode<T> heapData[(SHARED_MEMORY_SIZE - 512 * sizeof(T)) / sizeof(CudaHeapNode<T>)];
__shared__ T eachHeapMaxValue[512];
/* optimization on k: the parameter must be more than half of k */
int parameter = 0;
......@@ -429,7 +429,7 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
}
__syncthreads();
/*to merge the heap use another way*/
/* merge the heaps using another approach */
T minData = minValue;
int heapLimit = heap.count / 2;
if (heapLimit % 2 == 0 && heapLimit != 0) heapLimit -= 1;
......@@ -438,12 +438,13 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
minData = heap.items[counter].value;
}
eachHeapMaxValue[threadIdx.y * blockDim.x + threadIdx.x] = minData;
// needs more optimization
if (i == 0) {
int threadLimit = (threadIdx.y + 1) * blockDim.x;
int threadLimit = threadIdx.y * blockDim.x + min(blockDim.x,strideNum);
CudaXHeap<MIN_HEAP, T> chooseHeap(k, heapData + k * ((blockDim.x * blockDim.y) + threadIdx.y));
int counter = threadIdx.y * blockDim.x;
for (; counter < threadIdx.y * blockDim.x + k; ++counter) {
for (; counter < threadIdx.y * blockDim.x + min(k, blockDim.x); ++counter) {
chooseHeap.Push(counter, eachHeapMaxValue[counter]);
}
for (; counter < threadLimit; ++counter) {
......@@ -451,15 +452,16 @@ void KernelTopK3(T * input, int stride, int strideNum, int blockNum, int k, T mi
chooseHeap.ReplaceTop(counter, eachHeapMaxValue[counter]);
}
}
int heapNum = chooseHeap.count;
CudaXHeap<MIN_HEAP, T> ansHeapData(k, k - parameter, heapData + k * chooseHeap.items[0].index);
int miss = parameter;
for (counter = 1; counter < k; ++counter) {
for (counter = 1; counter < heapNum; ++counter) {
chooseHeap.items[0] = chooseHeap.items[chooseHeap.count - 1];
chooseHeap.count--;
chooseHeap.Down(0);
CudaHeapNode<T> * cmpHeapData = heapData + k * (chooseHeap.items[0].index);
int cmpHeapLimit = 0;
if (counter + heapLimit <= k - parameter){
if (counter + heapLimit <= k - parameter && heapNum == k){
cmpHeapLimit = heapLimit;
}
/* take the max data from the minHeap, so start searching from the leaf nodes */
......@@ -770,22 +772,22 @@ void KernelTopKRadixSelect(unsigned int * input, int stride, int strideNum,
/*
if (idx == 0)
{
unsigned int* uintOutput = new unsigned int;
int* tmpIndex = new int;
//*******************something wrong***************************
cudaMalloc((void **)&uintOutput, sizeof(unsigned int) * k);
cudaMalloc((void **)&tmpIndex, sizeof(unsigned int) * k);
//*************************************************************
collectNumberOld(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
{
//for(int i = )
output[i] = deconvert(uintOutput[j]);
index[i] = tmpIndex[j];
}
unsigned int* uintOutput = new unsigned int;
int* tmpIndex = new int;
//*******************something wrong***************************
cudaMalloc((void **)&uintOutput, sizeof(unsigned int) * k);
cudaMalloc((void **)&tmpIndex, sizeof(unsigned int) * k);
//*************************************************************
collectNumberOld(input, limit, k, desire, uintOutput, tmpIndex, stride, strideNum);
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
for (int i = stride * k * blockIndex + offsetInBlock, j = 0; j < k; j++, i += stride)
{
//for(int i = )
output[i] = deconvert(uintOutput[j]);
index[i] = tmpIndex[j];
}
}
__syncthreads();
*/
......@@ -840,7 +842,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
/* we run the kernel if the heaps can fit into the shared memory */
cudaGrids[1] *= cudaBlocks[1];
cudaBlocks[1] = 1;
if ((cudaBlocks[0] * cudaBlocks[1] + 1) * k * (a->unitSize + sizeof(int)) < SHARED_MEMORY_SIZE) {
if ((cudaBlocks[0] * cudaBlocks[1] + 1) * k * (a->unitSize + sizeof(int)) + (512 * sizeof(int)) < SHARED_MEMORY_SIZE) {
if (a->dataType == DEFAULT_DTYPE) {
KernelTopK3<DTYPE> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)a->data, stride, strideNumA, blockNum, k, DTYPE_MIN,
......@@ -869,7 +871,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
//delete indexA;
int workerNum = WORKERSNUM;
GDevs.GetCudaThread2D(a->mem->devID,
GDevs.GetCudaThread2D(a->devID,
workerNum, stride * blockNum, MAX_INT,
cudaGrids, cudaBlocks);
if (a->dataType == DEFAULT_DTYPE) {
......
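The launch guard above budgets shared memory as (cudaBlocks[0] * cudaBlocks[1] + 1) heaps of k nodes plus the new 512-entry per-thread maximum buffer. A worked check under assumed values: with 4-byte float data (unitSize = 4), k = 32 and cudaBlocks[0] * cudaBlocks[1] = 32, the requirement is (32 + 1) * 32 * (4 + 4) + 512 * 4 = 8448 + 2048 = 10496 bytes, which fits comfortably inside a typical 48 KB shared-memory limit.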
......@@ -81,8 +81,10 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)
_ScaleAndShiftMe(&c, scale);
/* tensor connections */
XLink::MakeLink(&x, &maskIndex, &c, MOVEMENT_DROPOUTWITHINDEX);
XLink::AddParamToHead(&c, scale);
if (x.enableGrad) {
XLink::MakeLink(&x, &maskIndex, &c, MOVEMENT_DROPOUTWITHINDEX);
XLink::AddParamToHead(&c, scale);
}
return c;
}
......
......@@ -78,7 +78,9 @@ XTensor HardTanH(const XTensor &x)
_HardTanH(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
if (x.enableGrad) {
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
}
return y;
}
......@@ -92,7 +94,7 @@ void HardTanH(const XTensor &x, XTensor &y)
/* call _HardTanH function */
_HardTanH(&x, &y);
if (y.enableGrad) {
if (x.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
}
......
......@@ -54,7 +54,9 @@ XTensor Identity(const XTensor &x)
_Identity(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
if (x.enableGrad) {
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
}
return y;
}
......@@ -68,7 +70,7 @@ void Identity(const XTensor &x, XTensor &y)
/* call _Identity function */
_Identity(&x, &y);
if (y.enableGrad) {
if (x.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
}
......
......@@ -188,8 +188,10 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
_LogSoftmax(&x, &y, ld);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
if (x.enableGrad) {
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
}
return y;
}
......@@ -215,7 +217,7 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, ld);
if (y.enableGrad) {
if (x.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
......
......@@ -70,7 +70,9 @@ XTensor Rectify(const XTensor &x)
_Rectify(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
if (x.enableGrad) {
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
}
return y;
}
......@@ -84,7 +86,7 @@ void Rectify(const XTensor &x, XTensor &y)
/* call _Rectify function */
_Rectify(&x, &y);
if (y.enableGrad) {
if (x.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
}
......
......@@ -73,7 +73,9 @@ XTensor Sigmoid(const XTensor &x)
_Sigmoid(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
if (x.enableGrad) {
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
}
return y;
}
......@@ -87,7 +89,7 @@ void Sigmoid(const XTensor &x, XTensor &y)
/* call _Sigmoid function */
_Sigmoid(&x, &y);
if (y.enableGrad) {
if (x.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
}
......
......@@ -142,8 +142,10 @@ XTensor Softmax(const XTensor &x, int leadDim)
_Softmax(&x, &y, ld);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
if (x.enableGrad) {
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
}
return y;
}
......@@ -161,7 +163,7 @@ void Softmax(const XTensor &x, XTensor &y, int leadDim)
/* call _Softmax function */
_Softmax(&x, &y, ld);
if (y.enableGrad) {
if (x.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
......
......@@ -277,8 +277,11 @@ XTensor CrossEntropy(const XTensor & output, const XTensor & gold,
tails.Add((XTensor*)&gold);
tails.Add(weight);
tails.Add(padding);
XLink::MakeLink(&tails, &loss, LOSS_CROSSENTROPY);
XLink::AddParamToHeadInt(&loss, dim);
if (output.enableGrad) {
XLink::MakeLink(&tails, &loss, LOSS_CROSSENTROPY);
XLink::AddParamToHeadInt(&loss, dim);
}
return loss;
}
......@@ -302,8 +305,11 @@ XTensor CrossEntropy(const XTensor & output, const XTensor & gold,
tails.Add((XTensor*)&gold);
tails.Add(weight);
tails.Add((XTensor*)&padding);
XLink::MakeLink(&tails, &loss, LOSS_CROSSENTROPY);
XLink::AddParamToHeadInt(&loss, dim);
if (output.enableGrad) {
XLink::MakeLink(&tails, &loss, LOSS_CROSSENTROPY);
XLink::AddParamToHeadInt(&loss, dim);
}
return loss;
}
......@@ -677,4 +683,4 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -406,6 +406,68 @@ bool TestSetData5()
#endif // USE_CUDA
}
/*
case 6: test SetDataRange function.
generate data items over a range specified by start, end and step
*/
bool TestSetData6()
{
/* an input tensor of size (5) */
int order = 1;
int * dimSize = new int[order];
dimSize[0] = 5;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(order, dimSize);
/* initialize variables */
s->SetZeroAll();
/* call _SetDataRange function */
_SetDataRange(s, 5.2, -3.2, -2);
/* check results */
cpuTest = s->CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetZeroAll();
/* call _SetDataRange function */
_SetDataRange(sGPU, 5.2, -3.2, -2);
gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete s;
delete sGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
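The expected values follow from the range parameters alone: starting at 5.2 and stepping by -2 gives 5.2, 3.2, 1.2, -0.8, -2.8, and the element count is ceil((-3.2 - 5.2) / -2) = ceil(4.2) = 5, which matches the tensor size declared above.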
/* other cases */
/*
TODO!!
......@@ -462,6 +524,15 @@ bool TestSetData()
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* case 6 test */
caseFlag = TestSetData6();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 6 failed!\n");
}
else
XPRINT(0, stdout, ">> case 6 passed!\n");
/* other cases test */
/*
TODO!!
......
......@@ -105,10 +105,62 @@ bool TestTopK1()
TopK(sUser, tUser2, indexUser2, dim, k);
/* check results */
cpuTest = t1->CheckData(tAnswer1, tUnitNum) && tUser1.CheckData(tAnswer1, tUnitNum)
&& t2->CheckData(tAnswer2, tUnitNum) && tUser2.CheckData(tAnswer2, tUnitNum)
&& index1->CheckData(indexAnswer1, tUnitNum) && indexUser1.CheckData(indexAnswer1, tUnitNum)
&& index2->CheckData(indexAnswer2, tUnitNum) && indexUser2.CheckData(indexAnswer2, tUnitNum);
for (int i = 0; i < tDimSize[1]; ++i)
{
for (int j = 0; j < tDimSize[0]; ++j)
{
float tmpData = ((float *)t1->data)[i + tDimSize[1] * j];
int tmpIndex = ((int *)index1->data)[i + tDimSize[1] * j];
float tmpDataUser = ((float *)tUser1.data)[i + tDimSize[1] * j];
int tmpIndexUser = ((int *)indexUser1.data)[i + tDimSize[1] * j];
bool flag = false;
bool flagUser = false;
for (int k = 0; k < tDimSize[0]; ++k)
{
float* ans = tAnswer1[0];
int* ansIndex = indexAnswer1[0];
if (tmpData == ans[i + tDimSize[1] * k] && tmpIndex == ansIndex[i + tDimSize[1] * k])
{
flag = true;
}
if (tmpDataUser == ans[i + tDimSize[1] * k] && tmpIndexUser == ansIndex[i + tDimSize[1] * k])
{
flagUser = true;
}
}
cpuTest = cpuTest && flag && flagUser;
}
}
for (int i = 0; i < tDimSize[0]; ++i)
{
for (int j = 0; j < tDimSize[1]; ++j)
{
float tmpData = ((float *)t2->data)[i * tDimSize[1] + j];
int tmpIndex = ((int *)index2->data)[i * tDimSize[1] + j];
float tmpDataUser = ((float *)tUser2.data)[i * tDimSize[1] + j];
int tmpIndexUser = ((int *)indexUser2.data)[i * tDimSize[1] + j];
bool flag = false;
bool flagUser = false;
for (int k = 0; k < tDimSize[1]; ++k)
{
float* ans = tAnswer2[0];
int* ansIndex = indexAnswer2[0];
if (tmpData == ans[i * tDimSize[1] + k] && tmpIndex == ansIndex[i * tDimSize[1] + k])
{
flag = true;
}
if (tmpDataUser == ans[i * tDimSize[1] + k] && tmpIndexUser == ansIndex[i * tDimSize[1] + k])
{
flagUser = true;
}
}
cpuTest = cpuTest && flag && flagUser;
}
}
#ifdef USE_CUDA
/* GPU test */
......@@ -152,10 +204,74 @@ bool TestTopK1()
TopK(sUserGPU, tUserGPU2, indexUserGPU2, dim, k);
/* check results */
gpuTest = tGPU1->CheckData(tAnswer1, tUnitNum) && tUserGPU1.CheckData(tAnswer1, tUnitNum)
&& tGPU2->CheckData(tAnswer2, tUnitNum) && tUserGPU2.CheckData(tAnswer2, tUnitNum)
&& indexGPU1->CheckData(indexAnswer1, tUnitNum) && indexUserGPU1.CheckData(indexAnswer1, tUnitNum)
&& indexGPU2->CheckData(indexAnswer2, tUnitNum) && indexUserGPU2.CheckData(indexAnswer2, tUnitNum);
float* checkData = new float[tUnitNum];
int* checkIndex = new int[tUnitNum];
float* checkDataUser = new float[tUnitNum];
int* checkIndexUser = new int[tUnitNum];
cudaMemcpy(checkData, tGPU1->data, sizeof(DTYPE) * tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkIndex, indexGPU1->data, sizeof(int)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkDataUser, tUserGPU1.data, sizeof(DTYPE)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkIndexUser, indexUserGPU1.data, sizeof(int)*tUnitNum, cudaMemcpyDeviceToHost);
for (int i = 0; i < tDimSize[1]; ++i)
{
for (int j = 0; j < tDimSize[0]; ++j)
{
float tmpData = ((float *)checkData)[i + tDimSize[1] * j];
int tmpIndex = ((int *)checkIndex)[i + tDimSize[1] * j];
float tmpDataUser = ((float *)checkDataUser)[i + tDimSize[1] * j];
int tmpIndexUser = ((int *)checkIndexUser)[i + tDimSize[1] * j];
bool flag = false;
bool flagUser = false;
for (int k = 0; k < tDimSize[0]; ++k)
{
float* ans = tAnswer1[0];
int* ansIndex = indexAnswer1[0];
if (tmpData == ans[i + tDimSize[1] * k] && tmpIndex == ansIndex[i + tDimSize[1] * k])
{
flag = true;
}
if (tmpDataUser == ans[i + tDimSize[1] * k] && tmpIndexUser == ansIndex[i + tDimSize[1] * k])
{
flagUser = true;
}
}
gpuTest = gpuTest && flag && flagUser;
}
}
cudaMemcpy(checkData, tGPU2->data, sizeof(DTYPE)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkIndex, indexGPU2->data, sizeof(int)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkDataUser, tUserGPU2.data, sizeof(DTYPE)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkIndexUser, indexUserGPU2.data, sizeof(int)*tUnitNum, cudaMemcpyDeviceToHost);
for (int i = 0; i < tDimSize[0]; ++i)
{
for (int j = 0; j < tDimSize[1]; ++j)
{
float tmpData = ((float *)checkData)[i * tDimSize[1] + j];
int tmpIndex = ((int *)checkIndex)[i * tDimSize[1] + j];
float tmpDataUser = ((float *)checkDataUser)[i * tDimSize[1] + j];
int tmpIndexUser = ((int *)checkIndexUser)[i * tDimSize[1] + j];
bool flag = false;
bool flagUser = false;
for (int k = 0; k < tDimSize[1]; ++k)
{
float* ans = tAnswer2[0];
int* ansIndex = indexAnswer2[0];
if (tmpData == ans[i * tDimSize[1] + k] && tmpIndex == ansIndex[i * tDimSize[1] + k])
{
flag = true;
}
if (tmpDataUser == ans[i * tDimSize[1] + k] && tmpIndexUser == ansIndex[i * tDimSize[1] + k])
{
flagUser = true;
}
}
gpuTest = gpuTest && flag && flagUser;
}
}
/* destroy variables */
delete s;
......@@ -170,6 +286,10 @@ bool TestTopK1()
delete indexGPU2;
delete[] sDimSize;
delete[] tDimSize;
delete[] checkData;
delete[] checkIndex;
delete[] checkDataUser;
delete[] checkIndexUser;
return cpuTest && gpuTest;
#else
......@@ -247,8 +367,33 @@ bool TestTopK2()
TopK(sUser, tUser, indexUser, dim, k);
/* check results */
cpuTest = t->CheckData(tAnswer, tUnitNum) && tUser.CheckData(tAnswer, tUnitNum)
&& index->CheckData(indexAnswer, tUnitNum) && indexUser.CheckData(indexAnswer, tUnitNum);
for (int i = 0; i < tDimSize[0]; ++i)
{
for (int j = 0; j < tDimSize[1]; ++j)
{
float tmpData = ((float *)t->data)[i * tDimSize[1] + j];
int tmpIndex = ((int *)index->data)[i * tDimSize[1] + j];
float tmpDataUser = ((float *)tUser.data)[i * tDimSize[1] + j];
int tmpIndexUser = ((int *)indexUser.data)[i * tDimSize[1] + j];
bool flag = false;
bool flagUser = false;
for (int k = 0; k < tDimSize[1]; ++k)
{
float* ans = tAnswer[0];
int* ansIndex = indexAnswer[0];
if (tmpData == ans[i * tDimSize[1] + k] && tmpIndex == ansIndex[i * tDimSize[1] + k])
{
flag = true;
}
if (tmpDataUser == ans[i * tDimSize[1] + k] && tmpIndexUser == ansIndex[i * tDimSize[1] + k])
{
flagUser = true;
}
}
cpuTest = cpuTest && flag && flagUser;
}
}
#ifdef USE_CUDA
/* GPU test */
......@@ -279,8 +424,42 @@ bool TestTopK2()
TopK(sUserGPU, tUserGPU, indexUserGPU, dim, k);
/* check results */
gpuTest = tGPU->CheckData(tAnswer, tUnitNum) && tUserGPU.CheckData(tAnswer, tUnitNum)
&& indexGPU->CheckData(indexAnswer, tUnitNum) && indexUserGPU.CheckData(indexAnswer, tUnitNum);
float* checkData = new float[tUnitNum];
int* checkIndex = new int[tUnitNum];
float* checkDataUser = new float[tUnitNum];
int* checkIndexUser = new int[tUnitNum];
cudaMemcpy(checkData, tGPU->data, sizeof(DTYPE)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkIndex, indexGPU->data, sizeof(int)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkDataUser, tUserGPU.data, sizeof(DTYPE)*tUnitNum, cudaMemcpyDeviceToHost);
cudaMemcpy(checkIndexUser, indexUserGPU.data, sizeof(int)*tUnitNum, cudaMemcpyDeviceToHost);
for (int i = 0; i < tDimSize[0]; ++i)
{
for (int j = 0; j < tDimSize[1]; ++j)
{
float tmpData = ((float *)checkData)[i * tDimSize[1] + j];
int tmpIndex = ((int *)checkIndex)[i * tDimSize[1] + j];
float tmpDataUser = ((float *)checkDataUser)[i * tDimSize[1] + j];
int tmpIndexUser = ((int *)checkIndexUser)[i * tDimSize[1] + j];
bool flag = false;
bool flagUser = false;
for (int k = 0; k < tDimSize[1]; ++k)
{
float* ans = tAnswer[0];
int* ansIndex = indexAnswer[0];
if (tmpData == ans[i * tDimSize[1] + k] && tmpIndex == ansIndex[i * tDimSize[1] + k])
{
flag = true;
}
if (tmpDataUser == ans[i * tDimSize[1] + k] && tmpIndexUser == ansIndex[i * tDimSize[1] + k])
{
flagUser = true;
}
}
gpuTest = gpuTest && flag && flagUser;
}
}
/* destroy variables */
delete s;
......@@ -291,6 +470,10 @@ bool TestTopK2()
delete indexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] checkData;
delete[] checkIndex;
delete[] checkDataUser;
delete[] checkIndexUser;
return cpuTest && gpuTest;
#else
......
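The rewritten checks in both tests accept any permutation of tied results: a returned (value, index) pair passes if it occurs anywhere in the corresponding answer row, since equal values may legitimately come back in different orders on the CPU and the GPU. A sketch of that membership test as a standalone helper (RowContains is an illustrative name; the tests above inline this logic for each buffer):

/* return true if every (value, index) pair of a result row
   appears somewhere in the answer row */
bool RowContains(const float * row, const int * rowIndex,
                 const float * ans, const int * ansIndex, int k)
{
    for (int j = 0; j < k; j++) {
        bool found = false;
        for (int m = 0; m < k; m++) {
            if (row[j] == ans[m] && rowIndex[j] == ansIndex[m]) {
                found = true;
                break;
            }
        }
        if (!found)
            return false;
    }
    return true;
}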