update setdata by template and optimize some interface

bdf5c952 · xuchen · bd05b21b · bdf5c952 · bdf5c952 · bd05b21b
Commit bdf5c952 authored Feb 19, 2020 by xuchen
--- a/source/Main.cpp
+++ b/source/Main.cpp
@@ -63,155 +63,3 @@ int main( int argc, const char ** argv )
    
    return 0;
 }
-
-void BackwardTest()
-{
-    XNet net;
-
-    XTensor a;
-    XTensor b;
-    XTensor c;
-    a.enableGrad = true;
-    b.enableGrad = false;
-    c.enableGrad = false;
-    XTensor mean;
-    XTensor origin;
-    InitTensor2DV2(&a, 2, 3);
-    InitTensor1DV2(&b, 2);
-
-    a.SetZeroAll();
-    b.SetZeroAll();
-    a.Set2D(1.0F, 0, 0);
-    a.Set2D(2.0F, 0, 1);
-    a.Set2D(3.0F, 0, 2);
-    a.Set2D(4.0F, 1, 0);
-    a.Set2D(5.0F, 1, 1);
-    a.Set2D(6.0F, 1, 2);
-
-    b.Set1D(2.0F, 0);
-    b.Set1D(1.0F, 1);
-
-    DivDim(a, b, c, 0);
-    c.Dump(stderr, "c:");
-    auto loss = CrossEntropy(c, a);
-
-    //XLink::ShowNetwork(stderr, &c);
-
-    net.Backward(loss);
-
-    a.grad->Dump(stderr);
-
-}
-
-void TransposeTest()
-{
-#ifdef USE_CUDA
-    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
-    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
-    XTensor x;
-    XTensor y;
-    XTensor z;
-
-    int loops = 2000;
-
-    int B = 3 * 2 * 4;
-    int K = 8 * 1;
-    int N = 50;
-    int H = 512 * 4;
-
-    int nnn = GDevs.nGPU;
-
-    InitTensor3DV2(&x, B, N, H, X_FLOAT, 0);
-    InitTensor4DV2(&y, K, B, N, H/K, X_FLOAT, 0);
-    InitTensor3DV2(&z, B, N, H, X_FLOAT, 0);
-
-    cudaEvent_t ctime0;
-    cudaEvent_t ctime1;
-    cudaEvent_t ctime2;
-    cudaEvent_t ctime3;
-    cudaEvent_t ctime4;
-    cudaEvent_t ctime5;
-
-    float elapsedSplit = 0.0;
-    float elapsedMerge = 0.0;
-    float elapsedSum = 0.0;
-
-    cudaEventCreate(&ctime0);
-    cudaEventCreate(&ctime1);
-    cudaEventCreate(&ctime2);
-    cudaEventCreate(&ctime3);
-    cudaEventCreate(&ctime4);
-    cudaEventCreate(&ctime5);
-
-    cudaEventRecord(ctime0, 0);
-
-    double time0 = GetClock();
-    for(int i = 0; i < loops; i++)
-        _Split(&x, &y, 2, K);
-    double time1 = GetClock();
-    
-    cudaEventRecord(ctime1, 0);
-    cudaEventSynchronize(ctime1);
-    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);
-
-    cudaEventRecord(ctime2, 0);
-
-    double time2 = GetClock();
-    for(int i = 0; i < loops; i++)
-        _Merge(&y, &x, 3);
-    double time3 = GetClock();
-
-    cudaEventRecord(ctime3, 0);
-    cudaEventSynchronize(ctime3);
-    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);
-
-    cudaEventRecord(ctime4, 0);
-
-    double time4 = GetClock();
-    for(int i = 0; i < loops; i++)
-        _Sum(&x, &z, &x);
-    double time5 = GetClock();
-
-    cudaEventRecord(ctime5, 0);
-    cudaEventSynchronize(ctime5);
-    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);
-
-    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
-    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
-#endif
-}
-
-void SumDimTest()
-{
-    XTensor x;
-    XTensor y;
-    XTensor z;
-
-    int a = 5;
-    int b = 7;
-    int c = 3;
-
-    InitTensor3DV2(&x, a, b, c, X_FLOAT, -1);
-    InitTensor1DV2(&y, c, X_FLOAT, -1);
-    InitTensor3DV2(&z, a, b, c, X_FLOAT, -1);
-
-    x.SetZeroAll();
-    y.SetZeroAll();
-    z.SetZeroAll();
-
-    DTYPE * data = new DTYPE[x.unitNum];
-
-    for(int i = 0; i < x.unitNum; i++)
-        data[i] = (DTYPE)i;
-    x.SetData(data, x.unitNum);
-
-    for(int i = 0; i < y.unitNum; i++)
-        data[i] = -(DTYPE)i;
-    y.SetData(data, y.unitNum);
-
-    _SumDim(&x, &y, &z, 2);
-
-    z.Dump(stderr, "z:");
-
-    delete[] data;
-}
--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -33,7 +33,6 @@

 namespace nts{

-
 /* compute dE/dx of a node */
 void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
 {
@@ -53,15 +52,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
        XTensor * dedy = output->grad;

        if (income.tailNum == 1) {
-            if (dedy->dataType == X_FLOAT)
-                _SetDataFixedFloat(dedy, 1.0F);
-            else if (dedy->dataType == X_DOUBLE)
-                _SetDataFixedDouble(dedy, 1.0);
-            else if (dedy->dataType == X_INT)
-                _SetDataFixedInt(dedy, 1);
-            else
-                ShowNTErrors("TODO");
-
+            dedy->SetDataFixed(1);
            return;
        }


--- a/source/tensor/Main.cpp
+++ b/source/tensor/Main.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- 
-/*
- *
- * This is the entrance of the low-level tensor library : NiuTrans.Tensor
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2015-12-14
- *
- */
-
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-#include "XTensor.h"
-#include "XDevice.h"
-#include "./test/Test.h"
-#include "./core/CHeader.h"
-#include "./XBLAS.h"
-#include "./core/sort/TopK.h"
-#include "./core/movement/Gather.h"
-//#define CRTDBG_MAP_ALLOC
-//#include <stdlib.h>  
-//#include <crtdbg.h> 
-
-using namespace nts;
-
-void SmallTest();
-void TransposeTest();
-void PowerTest();
-
-int main( int argc, const char ** argv )
-{
-    //PowerTest();
-    //LittleTest();
-
-    //T2TTest();
-    //T2TTest2();
-
-    //return 0;
-    //_CrtSetBreakAlloc(123);
-
-    /* a tiny test */
-    //SmallTest();
-
-    //_CrtDumpMemoryLeaks();
-    //return 0;
-
-    if(argc > 1 && !strcmp(argv[1], "-test"))
-        Test();
-    else{
-        fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
-        fprintf(stderr, "use of tensors. All you need is to ... \n\n");
-        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
-    }
-
-    //_CrtDumpMemoryLeaks();
-
-    return 0;
-}
-
-void myRead(XTensor * tensor, const char * filename, const char * label)
-{
-    FILE * file = fopen(filename, "rb");
-    if(file == NULL)
-        printf("%s\n", filename);
-    tensor->Read(file, label);
-}
-
-void myDump(XTensor * tensor, const char * filename, const char * label)
-{
-    FILE * file = fopen(filename, "wb");
-    if(file == NULL)
-        printf("%s\n", filename);
-    tensor->Dump(file, label);
-}
-
-void PowerTest()
-{
-    XTensor input;
-    XTensor output;
-    InitTensor2D(&input, 256, 10000, X_FLOAT, 0);
-    InitTensor2D(&output, 256, 10000, X_FLOAT, 0);
-    myRead(&input, "1.txt", "");
-
-    _Power(&input, &output, 2);
-    output.Dump(stderr, "", 200);
-}
-
-void SmallTest()
-{
-    XTensor a;
-    XTensor b;
-    XTensor c;
-    XTensor d;
-
-    InitTensor2D(&a, 2, 2);
-    InitTensor2D(&b, 2, 2);
-    a.SetZeroAll();
-    b.SetZeroAll();
-    a.Set2D(1.0F, 0, 0);
-    a.Set2D(2.0F, 1, 1);
-
-    b = Sum(a, Multiply(a, a));
-
-    /* this is prohibited !!!!!!!!!!!!! */
-    //XTensor c = a * b + a;
-    //XTensor d = a + b + c.Lin(0.5F);
-    
-    c = a * b + a;
-    d = a + b + c.Lin(0.5F);
-
-    XLink::CheckNetwork(&d);
-    //XLink::ShowNetwork(stderr, &d);
-        
-    a.Dump(stderr, "a:");
-    b.Dump(stderr, "b:");
-    c.Dump(stderr, "c:");
-    d.Dump(stderr, "d:");
-}
-
-void TransposeTest()
-{
-    XTensor a;
-    XTensor b;
-
-    int I = 2;
-    int J = 3;
-
-    InitTensor4D(&a, 2, 3, 4, 5);
-
-    int * dims = new int[a.order];
-    memcpy(dims, a.dimSize, sizeof(int) * a.order);
-    dims[I] = a.dimSize[J];
-    dims[J] = a.dimSize[I];
-
-    InitTensor(&b, 4, dims);
-
-    a.SetZeroAll();
-    b.SetZeroAll();
-
-    float * data = new float[a.unitNum];
-    for(int i = 0; i < a.unitNum; i++)
-        data[i] = (float)i;
-
-    a.SetData(data, a.unitNum, 0);
-
-    _Transpose(&a, &b, I, J);
-    b.Dump(stderr, "b:");
-
-    delete[] data;
-}
-
-
-
--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -526,6 +526,8 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
    }
    else {
        myMem = GMems.GetMem(myDevId);
+        FlushToMem(myMem);
+        isInGlobalMem = false;
    }
 }

@@ -818,6 +820,16 @@ void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
    _SetDataRange(this, lower, upper, step);
 }

+/* generate data items with a fixed value */
+template<class T>
+void XTensor::SetDataFixed(T num)
+{
+    _SetDataFixed(this, num);
+}
+template void XTensor::SetDataFixed<int>(int);
+template void XTensor::SetDataFixed<float>(float);
+template void XTensor::SetDataFixed<double>(double);
+
 /* 
 set the tensor items by a uniform distribution in range [lower, upper]
 >> lower - lower value of the range
@@ -825,62 +837,7 @@ set the tensor items by a uniform distribution in range [lower, upper]
 */
 void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
 {
-    // TODO: GPU code!!!!!!!
-
-    if (data == NULL)
-        return;
-
-    // srand((unsigned)time(0));
-    DTYPE variance = upper - lower;
-    void * d = NULL;
-    if (dataType == X_FLOAT) {
-        d = new float[unitNum];
-        for (int i = 0; i < unitNum; i++) {
-            DTYPE value = lower + variance * (float)rand() / RAND_MAX;
-            *((float*)d + i) = value;
-        }
-    }
-    else if (dataType == X_DOUBLE) {
-        d = new double[unitNum];
-        for (int i = 0; i < unitNum; i++) {
-            *((double*)d + i) = lower + variance * rand() / RAND_MAX;
-        }
-    }
-    else {
-        ShowNTErrors("Data type must be X_FLOAT or X_Double!");
-    }
-
-    SetData(d, unitNum);
-    
-    if (dataType == X_FLOAT) {
-        delete[] (float*)d;
-    }
-    else {
-        delete[] (double*)d;
-    }
-}
-
-/* a gauss distribution (Box-Muller method) */
-double GaussRand(DTYPE mean, DTYPE standardDeviation)
-{
-    // TODO: GPU code!!!!!!!
-
-    static double u, v;
-    static int phase = 0;
-    double z;
-    double pi = 3.141592654;
-
-    if (phase == 0){
-        u = (rand() + 1.0) / (RAND_MAX + 1.0);
-        v = (rand() + 1.0) / (RAND_MAX + 1.0);
-        z = sqrt(-2.0 * log(u))* sin(2.0 * pi * v);
-    }
-    else{
-        z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
-    }
-
-    phase = 1 - phase;
-    return mean + (z * standardDeviation);
+    _SetDataRand(this, lower, upper);
 }

 /* 
@@ -890,37 +847,7 @@ set the tensor items by a normal distribution
 */
 void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
 {
-    // TODO: cuda code!!!!!!!
-
-    if (data == NULL)
-        return;
-
-    // srand((unsigned)time(0));
-    void * d = NULL;
-    if (dataType == X_FLOAT) {
-        d = new float[unitNum];
-        for (int i = 0; i < unitNum; i++) {
-            *((float*)d + i) = (float)GaussRand(mean, standardDeviation);
-        }
-    }
-    else if (dataType == X_DOUBLE) {
-        d = new double[unitNum];
-        for (int i = 0; i < unitNum; i++) {
-            *((double*)d + i) = GaussRand(mean, standardDeviation);
-        }
-    }
-    else {
-        ShowNTErrors("Data type must be X_FLOAT or X_Double!");
-    }
-
-    SetData(d, unitNum);
-
-    if (dataType == X_FLOAT) {
-        delete[] (float*)d;
-    }
-    else {
-        delete[] (double*)d;
-    }
+    _SetDataRandN(this, mean, standardDeviation);
 }

 /* 

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -28,6 +28,7 @@
 #ifndef __XTENSOR_H__
 #define __XTENSOR_H__

+#include <math.h>
 #include "XGlobal.h"
 #include "XMem.h"
 #include "XPRunner.h"
@@ -303,6 +304,10 @@ public:
    /* generate data items with a range by start, end and the step */
    void Range(DTYPE lower, DTYPE upper, DTYPE step);

+    /* generate data items with a fixed value */
+    template<class T>
+    void SetDataFixed(T num);
+
    /* set tensor items by a uniform distribution */
    void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);


--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -91,9 +91,9 @@
 #include "sort/Sort.h"
 #include "sort/TopK.h"

-#include "utilities/XMatrixSegment.h"
-#include "utilities/FlushToMem.h"
 #include "utilities/CheckData.h"
+#include "utilities/FlushToMem.h"
 #include "utilities/SetAscendingOrder.h"
+#include "utilities/XMatrixSegment.h"

 #endif // __CHEADER_H__
--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -116,7 +116,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
    float confidence = 1 - labelSmoothingP;
    float lowconfidence = labelSmoothingP / size;

-    _SetDataFixedFloat(onehot, lowconfidence);
+    onehot->SetDataFixed(lowconfidence);

 #ifdef USE_CUDA
    if(onehot->devID >= 0 && index->devID >= 0) {

--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -77,277 +77,190 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
 }

 /*
-generate data items with a fixed value p 
->> tensor - the tensor whose data array would be initialized
->> p - pointer to the number for initializing the tensor
+set a data array with a fixed value
+
+>> d - pointer to the data array
+>> v - the initial value
+>> size - size of the array
 */
-void _SetDataFixed(XTensor * tensor, void * valuePointer)
+template<class T>
+void ArraySetDataFixed(T * d, T v, int size)
 {
-    int num = tensor->unitNum;
-
-    if(tensor->dataType == X_INT){
-        int p = *(int*)valuePointer;
-        if(tensor->devID < 0){
-            int * d = (int*)tensor->data;
-            if(num % 4 == 0){
-                for(int i = 0; i < num; i += 4){
-                    d[i] = p;
-                    d[i + 1] = p;
-                    d[i + 2] = p;
-                    d[i + 3] = p;
-                }
-            }
-            else{
-                for(int i = 0; i < num; i++)
-                    d[i] = p;
+    if (size % 4 == 0) {
+        for (int i = 0; i < size; i += 4) {
+            d[i] = v;
+            d[i + 1] = v;
+            d[i + 2] = v;
+            d[i + 3] = v;
        }
    }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedInt(tensor, p);
-#endif
-        }
-    }
-    else if(tensor->dataType == X_FLOAT){
-        float p = *(float*)valuePointer;
-        if(tensor->devID < 0){
-            float * d = (float*)tensor->data;
-            if(num % 4 == 0){
-                for(int i = 0; i < num; i += 4){
-                    d[i] = p;
-                    d[i + 1] = p;
-                    d[i + 2] = p;
-                    d[i + 3] = p;
-                }
-            }
-            else{
-                for(int i = 0; i < num; i++)
-                    d[i] = p;
-            }
-        }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedFloat(tensor, p);
-#endif
-        }
-    }
-    else if(tensor->dataType == X_DOUBLE){
-        double p = *(double*)valuePointer;
-        if(tensor->devID < 0){
-            double * d = (double*)tensor->data;
-            if(num % 4 == 0){
-                for(int i = 0; i < num; i += 4){
-                    d[i] = p;
-                    d[i + 1] = p;
-                    d[i + 2] = p;
-                    d[i + 3] = p;
-                }
-            }
-            else{
-                for(int i = 0; i < num; i++)
-                    d[i] = p;
-            }
-        }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedDouble(tensor, p);
-#endif
-        }
-    }
-    else{
-        ShowNTErrors("TODO");
+    else {
+        for (int i = 0; i < size; i++)
+            d[i] = v;
    }
 }

 /*
-generate data items with a fixed value p (in default type) 
->> tensor - the tensor whose data array would be initialized
->> p - number in default type
-*/
-void SetDataFixed(XTensor &tensor, DTYPE p)
-{
-    _SetDataFixed(&tensor, &p);
-}
+generate data items with a fixed value

-/*
-generate data items with a fixed value p (in integer)
->> tensor - the tensor whose data array would be initialized
->> p - an integer
+>> tensor - the tensor for initialization
+>> value - the initial value
 */
-void SetDataFixedInt(XTensor &tensor, int p)
+template<class T>
+void _SetDataFixed(XTensor * tensor, T value)
 {
-    CheckNTErrors(tensor.dataType == X_INT, "An integer tensor is required!");
-    _SetDataFixed(&tensor, &p);
-}
+    if (tensor->devID >= 0) {
+#ifdef USE_CUDA
+        _CudaSetDataFixed(tensor, value);
+        return;
+#else
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }

-/* 
-generate data items with a fixed value p (in integer) 
->> tensor - the tensor whose data array would be initialized
->> p - an int-valued number
-*/
-void _SetDataFixedInt(XTensor * tensor, int p)
-{
-    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
+    int num = tensor->unitNum;

-    if(p == 0)
-        tensor->SetZeroAll();
+    if (tensor->dataType == X_INT)
+        ArraySetDataFixed((int*)tensor->data, (int)value, num);
+    else if (tensor->dataType == X_FLOAT)
+        ArraySetDataFixed((float*)tensor->data, (float)value, num);
+    else if (tensor->dataType == X_DOUBLE)
+        ArraySetDataFixed((double*)tensor->data, (double)value, num);
    else
-        _SetDataFixed(tensor, &p);
+        ShowNTErrors("TODO! Unsupported datatype!")
 }
+template void _SetDataFixed<int>(XTensor*, int);
+template void _SetDataFixed<float>(XTensor*, float);
+template void _SetDataFixed<double>(XTensor*, double);

 /*
-generate data items with a fixed value p (in float) 
->> tensor - the tensor whose data array would be initialized
->> p - a float-valued number
-*/
-void _SetDataFixedFloat(XTensor * tensor, float p)
-{
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
+set array entries to a fixed value v only if the corresponding condition entry is non-zero

-    if(p == 0)
-        tensor->SetZeroAll();
-    else
-        _SetDataFixed(tensor, &p);
-}
-
-/* 
-generate data items with a fixed value p (in double) 
->> tensor - the tensor whose data array would be initialized
->> p - a double-valued number
+>> d - pointer to the data array
+>> c - pointer to the condition array
+>> v - the initial value
+>> size - size of the array
 */
-void _SetDataFixedDouble(XTensor * tensor, double p)
+template<class T>
+void ArraySetDataFixedCond(T* d, T* c, T v, int size)
 {
-    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
-
-    if(p == 0)
-        tensor->SetZeroAll();
-    else
-        _SetDataFixed(tensor, &p);
+    for (int i = 0; i < size; i++) {
+        if (c[i] != 0)
+            d[i] = v;
+    }
 }

 /* 
-generate data items with a fixed value p only if 
-the condition entry is non-zero 
+generate data items with a fixed value only if the condition entry is non-zero
+
 >> tensor - the tensor whose data array would be initialized
 >> condition - the condition tensor whose entries would be checked
               for set the corresponding entries in "tensor"
->> p - a given value
+>> value - a given value
 */
-void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p)
+template<class T>
+void _SetDataFixedCond(XTensor * tensor, XTensor * condition, T value)
 {
-    int num = tensor->unitNum;
+    CheckDev(tensor->devID, condition->devID);
+    CheckDataType(tensor->dataType, condition->dataType);

-    CheckNTErrors(num == condition->unitNum, "Wrong size of the condition tensor!");
-    CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
-
-    if(tensor->dataType == DEFAULT_DTYPE){
-        if(tensor->devID < 0){
-            DTYPE * data = (DTYPE*)tensor->data;
-            DTYPE * cond = (DTYPE*)condition->data;
-            for(int i = 0; i < num; i++){
-                if(cond[i] != 0)
-                    data[i] = p;
-            }
-        }
-        else{
+    if (tensor->devID >= 0) {
 #ifdef USE_CUDA
-            _CudaSetDataFixedCondFloat(tensor, condition, p);
+        _CudaSetDataFixedCond(tensor, condition, value);
+        return;
 #else
-            ShowNTErrors("Please specify USE_CUDA and recompile the code");
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif
    }
-    }
-    else{
-        ShowNTErrors("the tensor should be in integer typed!");
-    }
-}

-/* 
-generate data items with a fixed value p only if 
-the condition entry is non-zero 
->> tensor - the tensor whose data array would be initialized
->> condition - the condition tensor whose entries would be checked
-               for set the corresponding entries in "tensor"
->> p - a given value
-*/
-void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
-{
    int num = tensor->unitNum;

-    CheckNTErrors(num == condition->unitNum, "Wrong size of the condition tensor!");
-    CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
-
-    if(tensor->dataType == DEFAULT_DTYPE){
-        if(tensor->devID < 0){
-            int * data = (int*)tensor->data;
-            int * cond = (int*)condition->data;
-            for(int i = 0; i < num; i++){
-                if(cond[i] != 0)
-                    data[i] = p;
-            }
-        }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedCondInt(tensor, condition, p);
-#else
-            ShowNTErrors("Please specify USE_CUDA and recompile the code");
-#endif
-        }
-    }
-    else{
-        ShowNTErrors("TODO!");
-    }
+    if (tensor->dataType == X_INT)
+        ArraySetDataFixedCond((int*)tensor->data, (int*)condition->data, (int)value, num);
+    else if (tensor->dataType == X_FLOAT)
+        ArraySetDataFixedCond((float*)tensor->data, (float*)condition->data, (float)value, num);
+    else if (tensor->dataType == X_DOUBLE)
+        ArraySetDataFixedCond((double*)tensor->data, (double*)condition->data, (double)value, num);
+    else
+        ShowNTErrors("TODO! Unsupported datatype!")
 }
+template void _SetDataFixedCond<int>(XTensor*, XTensor*, int);
+template void _SetDataFixedCond<float>(XTensor*, XTensor*, float);
+template void _SetDataFixedCond<double>(XTensor*, XTensor*, double);

 /* 
 set data items along with a given dimension (and keep the remaining items unchanged) 
->> tensor - the tensor whose data array would be initialized
+
+>> tensor - the tensor for initialization
 >> beg - the beginning position
 >> len - length along with the given dimension
 >> dim - the dimension along which we set the data
-e.g., given a 3 * 3 tensor 
+   e.g., given a 3 * 3 tensor 
         1 2 3
         4 5 6
         7 8 9
-      when beg = 1, len = 1, dim = 0 and p = 0, we have
+         when beg = 1, len = 1, dim = 0 and value = 0, we have
         1 2 3
         0 0 0
         7 8 9
         i.e., we set all entries of row 1 to 0
+>> value - the given value
 */
-void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
+template<class T>
+void _SetDataDim(XTensor * tensor, int beg, int len, int dim, T value)
 {
-    int n = tensor->order;
+    int order = tensor->order;
+    int size = tensor->GetDim(dim);
+    if (dim < 0)
+        dim = order + dim; 

-    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
-    CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
-    CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
-    CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");
+    CheckNTErrors(dim < order && dim >= 0, "Illegal dimension!");
+    CheckNTErrors(beg >= 0 && beg < size, "Illegal beginning position!");
+    CheckNTErrors(len >= 0 && beg + len <= size, "Illegal length!");
+
+    if (tensor->devID >= 0) {
+#ifdef USE_CUDA
+        _CudaSetDataDim(tensor, beg, len, dim, (DTYPE)value);
+        return;
+#else
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }

-    if(tensor->devID < 0){
    int stride = 1;
    int blockSize = 1;
    int blockNum  = 1;
-        for(int i = n - 1; i > dim; i--){
+
+    for (int i = order - 1; i > dim; i--)
        stride *= tensor->GetDim(i);
-        }
-        blockSize = stride * tensor->GetDim(dim);
+    blockSize = stride * size;
    blockNum = tensor->unitNum / blockSize;

-        int l = len * stride;
+    int initNum = len * stride;

-        for(int i = 0; i < blockNum; i++){
-            DTYPE * d = (DTYPE*)tensor->data + blockSize * i + beg * stride;    
-            for(int j = 0; j < l; j++)
-                d[j] = p;
+    for(int i = 0; i < blockNum; i++) {
+        if (tensor->dataType == X_INT) {
+            int* d = (int*)tensor->data + blockSize * i + beg * stride;
+            for (int j = 0; j < initNum; j++)
+                d[j] = (int)value;
        }
+        else if (tensor->dataType == X_FLOAT) {
+            float* d = (float*)tensor->data + blockSize * i + beg * stride;
+            for (int j = 0; j < initNum; j++)
+                d[j] = (float)value;
        }
-    else{
-#ifdef USE_CUDA
-        _CudaSetDataDim(tensor, beg, len, dim, p);
-#endif
+        else if (tensor->dataType == X_DOUBLE) {
+            double* d = (double*)tensor->data + blockSize * i + beg * stride;
+            for (int j = 0; j < initNum; j++)
+                d[j] = (double)value;
+        }
+        else
+            ShowNTErrors("TODO! Unsupported datatype!")
    }
 }
+template void _SetDataDim<int>(XTensor*, int, int, int, int);
+template void _SetDataDim<float>(XTensor*, int, int, int, float);
+template void _SetDataDim<double>(XTensor*, int, int, int, double);

 /* 
 modify data items along with a given index and dimension (and keep the remaining items unchanged) 
@@ -355,7 +268,7 @@ modify data items along with a given index and dimension (and keep the remaining
 >> modify - the tensor whose data array would be used to modify the source tensor
 >> dim - the dimension along which we modify the tensor
 >> index - index of the given dimension
-e.g., given a source tensor (3, 3)
+   e.g., given a source tensor (3, 3)
         1 2 3
         4 5 6
         7 8 9
@@ -367,102 +280,127 @@ e.g., given a source tensor (3, 3)
         7 8 9
         i.e., we set entries of row 1 to {1, 2, 3}
 */
-void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
+void _SetDataIndexed(XTensor * tensor, XTensor * modify, int dim, int index)
 {
-    int order = source->order;
-    int size = source->GetDim(dim);
+    int order = tensor->order;
+    int size = tensor->GetDim(dim);
+    if (dim < 0)
+        dim = order + dim;

-    CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
+    CheckDev(tensor->devID, modify->devID);
    CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
    CheckNTErrors(index >= 0 && index < size, "Illegal index!");
    
-    for(int i = 0; i < order - 1; i++){
-        if(i < dim){
-            CheckNTErrors(modify->GetDim(i) == source->GetDim(i), "Illegal dimension!");
+    for(int i = 0; i < order - 1; i++) {
+        if(i < dim) {
+            CheckNTErrors(modify->GetDim(i) == tensor->GetDim(i), "Illegal dimension!");
        }
-        else if(i >= dim){
-            CheckNTErrors(modify->GetDim(i) == source->GetDim(i+1), "Illegal dimension!");
+        else if(i >= dim) {
+            CheckNTErrors(modify->GetDim(i) == tensor->GetDim(i+1), "Illegal dimension!");
        }
    }

-    if(source->devID < 0 && modify->devID < 0){
+    if (tensor->devID >= 0) {
+#ifdef USE_CUDA
+        _CudaSetDataIndexed(tensor, modify, dim, index);
+        return;
+#else
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }
+
+    if(tensor->devID < 0) {
        int stride = 1;
        int blockSize = 1;
        int blockNum  = 1;

-        for(int i = order - 1; i > dim; i--){
-            stride *= source->GetDim(i);
+        for (int i = order - 1; i > dim; i--) {
+            stride *= tensor->GetDim(i);
        }

-        blockSize = stride * source->GetDim(dim);
-        blockNum = source->unitNum / blockSize;
+        blockSize = stride * tensor->GetDim(dim);
+        blockNum = tensor->unitNum / blockSize;

-        for(int i = 0; i < blockNum; i++){
-            DTYPE * d = (DTYPE*)source->data + blockSize * i + index * stride;
+        for (int i = 0; i < blockNum; i++) {
+            DTYPE * d = (DTYPE*)tensor->data + blockSize * i + index * stride;
            DTYPE * p = (DTYPE*)modify->data + stride * i;
            for(int j = 0; j < stride; j++)
                d[j] = p[j];
        }
    }
-    else if(source->devID >= 0 && modify->devID >= 0) {
-#ifdef USE_CUDA
-        _CudaSetDataIndexed(source, modify, dim, index);
-#else
-        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
-#endif
-    }
-    else{
-        ShowNTErrors("TODO!");
-    }
 }

 /* 
 generate data as lower triangular matrices for the last two dimensions 
+
 >> tensor - the tensor whose data to be set
->> p - the value for each entry of the lower triangular matrics
+>> value - the value for each entry of the lower triangular matrices
 >> shift - the offset from diagonal
-e.g., for a 3 * 3 tensor, 
-      when p = 1 ans shift = 0, we have
+
+   e.g., for a 3 * 3 tensor, 
+         when value = 1 and shift = 0, we have
         1 0 0
         1 1 0
         1 1 1
-      when p = 2 and shift = -1, we have
+         when value = 2 and shift = -1, we have
         0 0 0
         2 0 0
         2 2 0
 */
-void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
+void _SetDataLowTri(XTensor * tensor, DTYPE value, int shift)
 {
    int n = tensor->order;

-    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(n >= 2, "The tensor must have a order no less than 2!");
    CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2), 
                 "The last two dimensions must be of the same size!");

-    if(tensor->devID < 0){
-        int l = tensor->GetDim(-1);
-        int blockNum = 1;
-        int blockSize = l * l;
-        for(int i = 0; i < n - 2; i++)
-            blockNum *= tensor->GetDim(i);
+    tensor->SetZeroAll();
+    if (tensor->devID >= 0) {
+#ifdef USE_CUDA
+        _CudaSetDataLowTri(tensor, value, shift);
+        return;
+#else
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }

-        for(int i = 0; i < blockNum; i++){
-            DTYPE * d = (DTYPE*)tensor->data + i * blockSize;
-            for(int row = 0; row < l; row++){
-                for(int col = 0; col <= row + shift; col++){
-                    d[row * l + col] = p;
+    int size = tensor->GetDim(-1);
+    int blockSize = size * size;
+    int blockNum = tensor->unitNum / blockSize;
+
+    for (int i = 0; i < blockNum; i++) {
+        for (int row = 0; row < size; row++) {
+            if (tensor->dataType == X_INT) {
+                int * d = (int*)tensor->data + i * blockSize;
+                for (int col = 0; col <= row + shift; col++) {
+                    d[row * size + col] = (int)value;
                }
-                for(int col = MAX(0, row + shift + 1); col < l; col++){
-                    d[row * l + col] = 0;
+                /*for (int col = MAX(0, row + shift + 1); col < size; col++) {
+                    d[row * size + col] = 0;
+                }*/
            }
+            else if (tensor->dataType == X_FLOAT) {
+                float * d = (float*)tensor->data + i * blockSize;
+                for (int col = 0; col <= row + shift; col++) {
+                    d[row * size + col] = (float)value;
                }
+                /*for (int col = MAX(0, row + shift + 1); col < size; col++) {
+                    d[row * size + col] = 0;
+                }*/
            }
+            else if (tensor->dataType == X_DOUBLE) {
+                double * d = (double*)tensor->data + i * blockSize;
+                for (int col = 0; col <= row + shift; col++) {
+                    d[row * size + col] = (double)value;
+                }
+                /*for (int col = MAX(0, row + shift + 1); col < size; col++) {
+                    d[row * size + col] = 0;
+                }*/
+            }
+            else 
+                ShowNTErrors("TODO! Unsupported datatype!")
        }
-    else{
-#ifdef USE_CUDA
-        _CudaSetDataLowTri(tensor, p, shift);
-#endif
    }
 }

@@ -484,7 +422,7 @@ generate data items with a uniform distribution in [lower, upper]
 */
 void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 {
-    CheckNTErrors(upper > lower, "the high value must be greater than low value!");
+    CheckNTErrors(upper >= lower, "the high value must be greater than low value!");

    if(tensor == NULL)
        return;
@@ -506,27 +444,50 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
            }
        }
        else{
-            ShowNTErrors("TODO");
+            ShowNTErrors("TODO! Unsupported datatype!")
        }
    }
+    else{
+#ifdef USE_CUDA
        /*
        GPU code
        The trick here is that initialize the data on a temperary tensor on CPU.
        The CPU data is then copied to GPU.
        TODO: generate data points on GPUs straightforwardly.
        */
-    else{
-#ifdef USE_CUDA
-        _CudaSetDataRand(tensor, lower, upper);
+        //_CudaSetDataRand(tensor, lower, upper);
+        int num = tensor->unitNum;
+        DTYPE variance = upper - lower;
+
+        void * d = NULL;
+        if (tensor->dataType == X_FLOAT) {
+            d = new float[num];
+            for (int i = 0; i < num; i++) 
+                *((float*)d + i) = lower + variance * (float)rand() / RAND_MAX;
+        }
+        else if (tensor->dataType == X_DOUBLE) {
+            d = new double[num];
+            for (int i = 0; i < num; i++) 
+                *((double*)d + i) = (double)lower + variance * rand() / RAND_MAX;
+        }
+        else {
+            ShowNTErrors("Data type must be X_FLOAT or X_Double!");
+        }
+
+        tensor->SetData(d, num);
+
+        if (tensor->dataType == X_FLOAT) {
+            delete[](float*)d;
+        }
+        else {
+            delete[](double*)d;
+        }
 #endif
-        //XTensor * t2 = NewTensorV2(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
-        //_SetDataRand(t2, low, high);
-        //_CopyValues(t2, tensor);
-        //delete t2;
    }
 }

 /* generate data items with a range by start, end and the step
+
 >> tensor - the tensor whose data array would be initialized
 >> start - the begin of the array
 >> end - the end of the array (not included self)
@@ -537,7 +498,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
    CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");

    /* compute the true length according to the (start, end, step) */
-    DTYPE size = fabs(upper - lower);
+    DTYPE size = (DTYPE)fabs(upper - lower);
    int num = ceil(size / fabs(step));
    CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched.");

@@ -554,7 +515,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
            *((float*)data + i) = lower + i * step;
    }
    else {
-        ShowNTErrors("TODO!");
+        ShowNTErrors("TODO! Unsupported datatype!")
    }

    /* set the data from the array */
@@ -564,8 +525,10 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
 }

 /* 
-generate data items with a uniform distribution in [lower, upper] and set
-the item to a pre-defined value if the item >= p, set the item to 0 otherwise
+generate data items with a uniform distribution in [lower, upper] and 
+set the item to a pre-defined value if the item >= p, 
+set the item to 0 otherwise
+
 >> tensor - the tensor whose data array would be initialized
 >> lower - lower value of the range
 >> upper - upper value of the range
@@ -596,8 +559,30 @@ void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE va
    }
 }

+/* sample from a gaussian distribution using the Box-Muller transform;
+   each pair of uniform draws is reused for two consecutive samples */
+double GaussRand(DTYPE mean, DTYPE standardDeviation)
+{
+    static double u, v;
+    static int phase = 0;
+    const double pi = 3.141592654;
+
+    /* draw a fresh pair in (0, 1] on every other call */
+    const bool fresh = (phase == 0);
+    if (fresh) {
+        u = (rand() + 1.0) / (RAND_MAX + 1.0);
+        v = (rand() + 1.0) / (RAND_MAX + 1.0);
+    }
+    phase = 1 - phase;
+
+    const double radius = sqrt(-2.0 * log(u));
+    const double angle = 2.0 * pi * v;
+    return mean + ((fresh ? sin(angle) : cos(angle)) * radius * standardDeviation);
+}
+
 /*
 generate data items with a normal distribution with specified mean and standard deviation 
+
 >> tensor - the tensor that keeps the data
 >> mean - mean or expectation of the distribution
 >> standardDeviation - standard deviation of the distribution
@@ -605,7 +590,31 @@ generate data items with a normal distribution with specified mean and standard 
 void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation)
 {
    // TODO: rewrite it and add cuda code!!!!!!!
-    tensor->SetDataRandn(mean, standardDeviation);
+    /* nothing to fill for a NULL tensor (same guard as _SetDataRand) */
+    if (tensor == NULL)
+        return;
+
+    int num = tensor->unitNum;
+    /* draw the samples into a host buffer of the tensor's element type,
+       pass it to SetData() and free it inside the same branch, so an
+       unsupported type can never reach SetData() with a NULL buffer */
+    if (tensor->dataType == X_FLOAT) {
+        float * d = new float[num];
+        for (int i = 0; i < num; i++)
+            d[i] = (float)GaussRand(mean, standardDeviation);
+        tensor->SetData(d, num);
+        delete[] d;
+    }
+    else if (tensor->dataType == X_DOUBLE) {
+        double * d = new double[num];
+        for (int i = 0; i < num; i++)
+            d[i] = GaussRand(mean, standardDeviation);
+        tensor->SetData(d, num);
+        delete[] d;
+    }
+    else {
+        ShowNTErrors("TODO! Unsupported datatype!")
+    }
 }

 /* 

--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
 /* 
-* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
-* I'm surprised that I did not write this file till today.
-*/
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
+ * I'm surprised that I did not write this file till today.
+ */

 #include <curand.h>
 #include <time.h>
@@ -33,109 +33,34 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA

 /*
-set an integer data array with a fixed value p (in int) 
->> d - pointer to the data array
->> size - size of the array
->> p - the initial value
-*/
-__global__ 
-void KernelSetDataFixedInt(int * d, int size, int p)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        d[i] = p;
-}
-
-/* 
-generate data items with a fixed value p (in int) 
->> tensor - the tensor for initialization
->> p - the initial value
-*/
-void _CudaSetDataFixedInt(XTensor * tensor, int p)
-{
-    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
+set a data array with a fixed value

-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(tensor->devID, devIDBackup);
-
-    KernelSetDataFixedInt <<<blocks, threads >>>((int*)tensor->data, tensor->unitNum, p);
-
-    BacktoCudaDev(tensor->devID, devIDBackup);
-}
-
-/* 
-set a float data array with a fixed value p (in int) 
 >> d - pointer to the data array
+>> v - the initial value
 >> size - size of the array
->> p - the initial value
 */
+template<class T>
 __global__
-void KernelSetDataFixedFloat(float * d, int size, float p)
+void KernelSetDataFixed(T * d, T v, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size)
-        d[i] = p;
+        d[i] = v;
 }
+template __global__ void KernelSetDataFixed<int>(int *, int, int);
+template __global__ void KernelSetDataFixed<float>(float *, float, int);
+template __global__ void KernelSetDataFixed<double>(double *, double, int);

 /* 
-generate data items with a fixed value p (in float)
->> tensor - the tensor for initialization
->> p - the initial value
-*/
-void _CudaSetDataFixedFloat(XTensor * tensor, float p)
-{
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(tensor->devID, devIDBackup);
-
-    KernelSetDataFixedFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, p);
-
-    BacktoCudaDev(tensor->devID, devIDBackup);
-}
+generate data items with a fixed value 

-/* 
-set a double data array with a fixed value p (in int) 
->> d - pointer to the data array
->> size - size of the array
->> p - the initial value
-*/
-__global__ 
-void KernelSetDataFixedDouble(double * d, int size, double p)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        d[i] = p;
-}
-
-/* 
-generate data items with a fixed value p (in double) 
 >> tensor - the tensor for initialization
->> p - the initial value
+>> value - the initial value
 */
-void _CudaSetDataFixedDouble(XTensor * tensor, double p)
+template<class T>
+void _CudaSetDataFixed(XTensor * tensor, T value)
 {
-    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
-
    int gridSize[3];
    int blockSize[3];

@@ -145,59 +70,23 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p)
    dim3 threads(blockSize[0]);

    int devIDBackup;
-    ProtectCudaDev(tensor->devID, devIDBackup);
-
-    KernelSetDataFixedDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, p);
-
-    BacktoCudaDev(tensor->devID, devIDBackup);
-}
-
-/* 
-set a float data array with a fixed value p (in int) only 
-if the condition entry is non-zero 
->> d - pointer to the data array
->> c - pointer to the condition array
->> size - size of the array
->> p - the initial value
-*/
-__global__ 
-void KernelSetDataFixedCondFloat(float * d, float * c, int size, float p)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;

-    if (i < size && c[i] != 0)
-        d[i] = p;
-}
-
-/* 
-generate data items with a fixed value p (in float) only 
-if the condition entry is non-zero 
->> tensor - the tensor for initialization
->> condition - the condition tensor whose entry would be check to
-               set the corresponding entry in "tensor"
->> p - the initial value   
-*/
-void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p)
-{
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
-    CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataFixedCondFloat <<<blocks, threads >>>((float*)tensor->data, (float*)condition->data, 
-                                                               tensor->unitNum, p);
+    if (tensor->dataType == X_INT)
+        KernelSetDataFixed << <blocks, threads >> > ((int*)tensor->data, (int)value, tensor->unitNum);
+    else if (tensor->dataType == X_FLOAT)
+        KernelSetDataFixed << <blocks, threads >> > ((float*)tensor->data, (float)value, tensor->unitNum);
+    else if (tensor->dataType == X_DOUBLE)
+        KernelSetDataFixed << <blocks, threads >> > ((double*)tensor->data, (double)value, tensor->unitNum);
+    else
+        ShowNTErrors("TODO! Unsupported datatype!")

    BacktoCudaDev(tensor->devID, devIDBackup);
 }
+template void _CudaSetDataFixed<int>(XTensor *, int);
+template void _CudaSetDataFixed<float>(XTensor *, float);
+template void _CudaSetDataFixed<double>(XTensor *, double);

 /* 
 set a float data array with a fixed value p (in int) only 
@@ -207,28 +96,30 @@ if the condition entry is non-zero
 >> size - size of the array
 >> p - the initial value
 */
+template<class T>
 __global__ 
-void KernelSetDataFixedCondInt(int * d, float * c, int size, int p)
+void KernelSetDataFixedCond(T * d, T * c, T value, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size && c[i] != 0)
-        d[i] = p;
+        d[i] = value;
 }
-
+template __global__ void KernelSetDataFixedCond<int>(int*, int*, int, int);
+template __global__ void KernelSetDataFixedCond<float>(float*, float*, float, int);
+template __global__ void KernelSetDataFixedCond<double>(double*, double*, double, int);
 /* 
-generate data items with a fixed value p (in int) only 
-if the condition entry is non-zero 
+generate data items with a fixed value
+only if the condition entry is non-zero 
+
 >> tensor - the tensor for initialization
 >> condition - the condition tensor whose entry would be check to
               set the corresponding entry in "tensor"
->> p - the initial value   
+>> value - the initial value   
 */
-void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
+template<class T>
+void _CudaSetDataFixedCond(XTensor* tensor, XTensor* condition, T value)
 {
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
-    CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
-
    int gridSize[3];
    int blockSize[3];

@@ -240,11 +131,24 @@ void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataFixedCondInt <<<blocks, threads >>>((int*)tensor->data, (float*)condition->data, 
-                                                           tensor->unitNum, p);
+    if (tensor->dataType == X_INT)
+        KernelSetDataFixedCond <<< blocks, threads >>> ((int*)tensor->data, (int*)condition->data,
+                                                       (int)value, tensor->unitNum);
+    else if (tensor->dataType == X_FLOAT)
+        KernelSetDataFixedCond <<< blocks, threads >>> ((float*)tensor->data, (float*)condition->data,
+                                                       (float)value, tensor->unitNum);
+
+    else if (tensor->dataType == X_DOUBLE)
+        KernelSetDataFixedCond <<< blocks, threads >>> ((double*)tensor->data, (double*)condition->data,
+                                                       (double)value, tensor->unitNum);
+    else
+        ShowNTErrors("TODO! Unsupported datatype!")

    BacktoCudaDev(tensor->devID, devIDBackup);
 }
+template void _CudaSetDataFixedCond<int>(XTensor*, XTensor*, int);
+template void _CudaSetDataFixedCond<float>(XTensor*, XTensor*, float);
+template void _CudaSetDataFixedCond<double>(XTensor*, XTensor*, double);

 /* 
 set data array with a uniform distribution in [low, high] 
@@ -309,8 +213,9 @@ set data items along with a given dimension (and keep the remaining items unchan
 >> blockSize - size of a data block
 >> blockNum - number of data blocks
 */
+template<class T>
 __global__
-void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, DTYPE p)
+void KernelSetDataDim(T * d, int beg, int len, int blockSize, int blockNum, T p)
 {
    /* offset in each block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -326,6 +231,9 @@ void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, 

    d[blockSize * j + i] = p;
 }
+template __global__ void KernelSetDataDim<int>(int*, int, int, int, int, int);
+template __global__ void KernelSetDataDim<float>(float*, int, int, int, int, float);
+template __global__ void KernelSetDataDim<double>(double*, int, int, int, int, double);

 /* 
 set data items along with a given dimension (and keep the remaining items unchanged) - cuda version
@@ -343,7 +251,8 @@ e.g., given a 3 * 3 tensor
      7 8 9
      i.e., we set all entries of row 1 to 0
 */
-void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
+template<class T>
+void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p)
 {
    int n = tensor->order;

@@ -372,11 +281,24 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride, 
-                                            len * stride, blockSize, blockNum, p);
+    if (tensor->dataType == X_INT)
+        KernelSetDataDim << <blocks, threads >> > ((int*)tensor->data, beg * stride,
+                                                    len * stride, blockSize, blockNum, (int)p);
+    else if (tensor->dataType == X_FLOAT)
+        KernelSetDataDim << <blocks, threads >> > ((float*)tensor->data, beg * stride,
+                                                    len * stride, blockSize, blockNum, (float)p);
+
+    else if (tensor->dataType == X_DOUBLE)
+        KernelSetDataDim << <blocks, threads >> > ((double*)tensor->data, beg * stride,
+                                                    len * stride, blockSize, blockNum, (double)p);
+    else
+        ShowNTErrors("TODO! Unsupported datatype!")

    BacktoCudaDev(tensor->devID, devIDBackup);
 }
+template void _CudaSetDataDim<int>(XTensor*, int, int, int, int);
+template void _CudaSetDataDim<float>(XTensor*, int, int, int, float);
+template void _CudaSetDataDim<double>(XTensor*, int, int, int, double);

 /* 
 modify data items along with a given index and dimension 
@@ -462,6 +384,7 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)

 /* 
 set lower triangular matrics for each block
+
 >> d - pointer to the data array
 >> l - row number (or column number) of each block, i.e, 
       a block is l * l matrix
@@ -469,7 +392,7 @@ set lower triangular matrics for each block
 >> blockNum - number of the blocks
 >> p - the value for each entry of the lower triangular matrics
 >> shift - the offset from diagonal
-e.g., for a 3* 3 tensor, 
+   e.g., for a 3* 3 tensor, 
         when p = 1 ans shift = 0, we have
         1 0 0
         1 1 0
@@ -503,33 +426,26 @@ void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p,

 /*
 generate data as lower triangular matrics for last two dimensions (cuda version)
+
 >> tensor - the tensor whose data to be set
->> p - the value for each entry of the lower triangular matrics
+>> value - the value for each entry of the lower triangular matrices
 >> shift - the offset from diagonal
-e.g., for a 3* 3 tensor, 
-      when p = 1 ans shift = 0, we have
+
+   e.g., for a 3 * 3 tensor,
+         when value = 1 and shift = 0, we have
         1 0 0
         1 1 0
         1 1 1
-      when p = 2 and shift = -1, we have
+         when value = 2 and shift = -1, we have
         0 0 0
         2 0 0
         2 2 0
 */
-void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
+void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift)
 {
-    int n = tensor->order;
-
-    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
-    CheckNTErrors(n >= 2, "The tensor must have a order no less than 2!");
-    CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2), 
-                 "The last two dimensions must be of the same size!");
-
-    int l = tensor->GetDim(-1);
-    int blockNum = 1;
-    int blockSize = l * l;
-    for(int i = 0; i < n - 2; i++)
-        blockNum *= tensor->GetDim(i);
+    int size = tensor->GetDim(-1);
+    int blockSize = size * size;
+    int blockNum = tensor->unitNum / blockSize;

    int cudaGrids[3];
    int cudaBlocks[3];
@@ -542,7 +458,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
+    KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, size, blockSize, blockNum, value, shift);

    BacktoCudaDev(tensor->devID, devIDBackup);
 }

--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
@@ -28,31 +28,24 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* generate data items with a fixed value p (in int) */
-void _CudaSetDataFixedInt(XTensor * tensor, int p);
+/* generate data items with a fixed value */
+template<class T>
+void _CudaSetDataFixed(XTensor * tensor, T value);

-/* generate data items with a fixed value p (in float) */
-void _CudaSetDataFixedFloat(XTensor * tensor, float p);
-
-/* generate data items with a fixed value p (in double) */
-void _CudaSetDataFixedDouble(XTensor * tensor, double p);
-
-/* generate data items with a fixed value p (in float) only 
-   if the condition entry is non-zero */
-void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p);
-
-/* generate data items with a fixed value p (in int) only 
-   if the condition entry is non-zero */
-void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
+/* generate data items with a fixed value p
+   only if the condition entry is non-zero */
+template<class T>
+void _CudaSetDataFixedCond(XTensor * tensor, XTensor * condition, T p);

 /* set data items along with a given dimension (and keep the remaining items unchanged) */
-void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
+template<class T>
+void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p);

 /* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
 void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);

 /* generate data as lower triangular matrics for last two dimensions (cuda version) */
-void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
+void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift);

 /* generate data items with a uniform distribution in [lower, upper] */
 void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);

--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
@@ -30,32 +30,17 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* generate data items with a xavier initialization */
 void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);

-/* generate data items with a fixed value p */
-void _SetDataFixed(XTensor * tensor, void * valuePointer);
+/* generate data items with a fixed value */
+template<class T>
+void _SetDataFixed(XTensor * tensor, T value);

-/* generate data items with a fixed value p (in default type) */
-void SetDataFixed(XTensor &tensor, DTYPE p);
-    
-/* generate data items with a fixed value p (in integer) */
-void SetDataFixedInt(XTensor &tensor, int p);
-
-/* generate data items with a fixed value p (in int) */
-void _SetDataFixedInt(XTensor * tensor, int p);
-
-/* generate data items with a fixed value p (in float) */
-void _SetDataFixedFloat(XTensor * tensor, float p);
-
-/* generate data items with a fixed value p (in double) */
-void _SetDataFixedDouble(XTensor * tensor, double p);
-
-/* generate data items with a fixed value p only if the condition entry is non-zero */
-void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p);
-
-/* generate data items with a fixed value p only if the condition entry is non-zero */
-void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
+/* generate data items with a fixed value only if the condition entry is non-zero */
+template<class T>
+void _SetDataFixedCond(XTensor* tensor, XTensor* condition, T value);

 /* set data items along with a given dimension (and keep the remaining items unchanged) */
-void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
+template<class T>
+void _SetDataDim(XTensor * tensor, int beg, int len, int dim, T p);

 /* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
 void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);

--- a/source/tensor/function/DropoutWithIndex.cpp
+++ b/source/tensor/function/DropoutWithIndex.cpp
@@ -70,7 +70,7 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)

    InitTensor1DV2(&c, x.unitNum, x.dataType, x.devID, x.mem);

-    _SetDataFixedFloat(&c, 1.0F);
+    c.SetDataFixed(1.0);

    _DropoutWithIndex(&x, &maskIndex, &c);


--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
@@ -383,15 +383,7 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                  int leadDim, int tBeg, int tLen, int yBeg)
 {
    if(t == NULL){
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else{
-            ShowNTErrors("TODO");
-        }
+        dedy->SetDataFixed(1);
        return;
    }


--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
@@ -50,7 +50,7 @@ bool TestDropout1()
    XTensor yUser;

    /* initialize variables */
-    _SetDataFixedFloat(x, 1.0F);
+    x->SetDataFixed(1);
    y->SetZeroAll();

    /* call Dropout function */
@@ -88,7 +88,7 @@ bool TestDropout1()
    XTensor yUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(xGPU, 1.0F);
+    xGPU->SetDataFixed(1);
    yGPU->SetZeroAll();

    /* call Dropout function */
@@ -157,10 +157,10 @@ bool TestDropout2()
    XTensor * dedy = NewTensorV2(order, dimSize);

    /* initialize variables */
-    _SetDataFixedFloat(x, 1.0F);
+    x->SetDataFixed(1.0);
    y->SetZeroAll();
    dedx->SetZeroAll();
-    _SetDataFixedFloat(dedy, 1.5F);
+    dedy->SetDataFixed(1.5);

    /* call Dropout function */
    float dropProb = 0.5F;
@@ -183,10 +183,10 @@ bool TestDropout2()
    XTensor * dedyGPU = NewTensorV2(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    _SetDataFixedFloat(xGPU, 1.0F);
+    xGPU->SetDataFixed(1.0);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
-    _SetDataFixedFloat(dedyGPU, 1.5F);
+    dedyGPU->SetDataFixed(1.5);

    /* call Dropout function */
    _Dropout(xGPU, yGPU, seed, dropProb);

--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
@@ -195,8 +195,8 @@ bool TestReduceSum2()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    s->SetDataFixed(1);
+    answer->SetDataFixed(s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -215,7 +215,7 @@ bool TestReduceSum2()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -284,8 +284,8 @@ bool TestReduceSum3()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    s->SetDataFixed(1);
+    answer->SetDataFixed(s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -304,7 +304,7 @@ bool TestReduceSum3()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -373,8 +373,8 @@ bool TestReduceSum4()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    s->SetDataFixed(1);
+    answer->SetDataFixed(s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -393,7 +393,7 @@ bool TestReduceSum4()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -464,8 +464,8 @@ bool TestReduceSum5()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    s->SetDataFixed(1);
+    answer->SetDataFixed(s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -484,7 +484,7 @@ bool TestReduceSum5()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -556,8 +556,8 @@ bool TestReduceSum6()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    s->SetDataFixed(1);
+    answer->SetDataFixed(s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -576,7 +576,7 @@ bool TestReduceSum6()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);

--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
@@ -119,7 +119,7 @@ bool TestSetData2()
    XTensor * modify = NewTensorV2(dataOrder, dataDimSize);

    /* Initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
+    s->SetDataFixed(1);
    modify->SetData(data, dataUnitNum);

    /* call SetDataIndexed function */
@@ -137,7 +137,7 @@ bool TestSetData2()
    XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);

    /* Initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);
    modifyGPU->SetData(data, dataUnitNum);

    /* call SetDataIndexed function */
@@ -212,11 +212,11 @@ bool TestSetData3()
    XTensor * modify = NewTensorV2(dataOrder, dataDimSize);

    /* Initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
+    s->SetDataFixed(1);
    modify->SetData(data, dataUnitNum);

    /* call SetDataIndexed function */
-    _SetDataFixedFloat(s, 1.0F);
+    s->SetDataFixed(1);
    _SetDataIndexed(s, modify, 1, 1);
    
    /* check results */
@@ -231,7 +231,7 @@ bool TestSetData3()
    XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);

    /* Initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    sGPU->SetDataFixed(1);
    modifyGPU->SetData(data, dataUnitNum);
    
    /* call SetDataIndexed function */

--- a/source/tensor/test/TSpread.cpp
+++ b/source/tensor/test/TSpread.cpp
@@ -91,7 +91,7 @@ bool TestSpread1()
    XTensor * modify = NewTensorV2(dataOrder, dataDimSize);

    /* Initialize variables */
-    _SetDataFixedFloat(s, 0.0F);
+    s->SetZeroAll();
    modify->SetData(data, dataUnitNum);

    /* call _Spread function */
@@ -109,7 +109,7 @@ bool TestSpread1()
    XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);

    /* Initialize variables */
-    _SetDataFixedFloat(sGPU, 0.0F);
+    sGPU->SetZeroAll();
    modifyGPU->SetData(data, dataUnitNum);
    
    /* call _Spread function */

--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
@@ -296,8 +296,8 @@ bool TestSumDim3()
    /* initialize variables */
    a->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(b, 1.0F);
-    _SetDataFixedFloat(answer, 1.0F);
+    b->SetDataFixed(1);
+    answer->SetDataFixed(1);

    /* call SumDim function */
    _SumDim(a, b, c, 1);
@@ -323,7 +323,7 @@ bool TestSumDim3()
    /* Initialize variables */
    aGPU->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(bGPU, 1.0F);
+    bGPU->SetDataFixed(1);

    /* call sum function */
    _SumDim(aGPU, bGPU, cGPU, 1);
@@ -405,8 +405,8 @@ bool TestSumDim4()
    /* initialize variables */
    a->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(b, 1.0F);
-    _SetDataFixedFloat(answer, 1.0F);
+    b->SetDataFixed(1);
+    answer->SetDataFixed(1);

    /* call SumDim function */
    _SumDim(a, b, c, 1);
@@ -432,7 +432,7 @@ bool TestSumDim4()
    /* Initialize variables */
    aGPU->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(bGPU, 1.0F);
+    bGPU->SetDataFixed(1);

    /* call sum function */
    _SumDim(aGPU, bGPU, cGPU, 1);