1. implement SetData by template 2. update float16 datatype of SetData

aa875fba · linye · 9f14dc72 · aa875fba · aa875fba · aa875fba
Commit aa875fba authored Jul 22, 2019 by linye
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -399,8 +399,8 @@ void xcTest()
    InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
    XTensor tensor;
    
-    _SetDataFixedFloat(&t1, 1.0F);
-    _SetDataFixedFloat(&t2, 2.0F);
+    _SetDataFixed(&t1, 1.0F);
+    _SetDataFixed(&t2, 2.0F);

    tensor = t1 + t2;


--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -52,15 +52,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * dedy = output->grad;

    if (income.tailNum == 1) {
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else
-            ShowNTErrors("TODO");
-
+        _SetDataFixed(dedy, 1.0F);
        return;
    }

@@ -144,15 +136,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
                        LOSS_FUNCTION_NAME lossName)
 {
    if(gold == NULL){
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else{
-            ShowNTErrors("TODO");
-        }
+        _SetDataFixed(dedy, 1.0F);
        return;
    }


--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -25,6 +25,7 @@
 #include "SetData.cuh"
 #include "../../XUtility.h"
 #include "../movement/CopyValues.h"
+#include "ConvertDataType.h"

 #if !defined( WIN32 ) && !defined( _WIN32 )
    #include "sys/time.h"
@@ -77,153 +78,78 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
 }

 /* 
-generate data items with a fixed value p 
+generate data items with a fixed value 
 >> tensor - the tensor whose data array would be initialized
->> p - pointer to the number for initializing the tensor
+>> value - the number for initializing the tensor (cast to the data type of the tensor)
 */
-void _SetDataFixed(XTensor * tensor, void * valuePointer)
+template<class T>
+void _SetDataFixed(XTensor * tensor, T value)
 {
+#ifdef USE_CUDA
+    if (tensor->devID >= 0) {
+        _CudaSetDataFixed(tensor, value);
+        return;
+    }
+#endif
+
    int num = tensor->unitNum;

-    if(tensor->dataType == X_INT){
-        int p = *(int*)valuePointer;
-        if(tensor->devID < 0){
-            int * d = (int*)tensor->data;
-            if(num % 4 == 0){
-                for(int i = 0; i < num; i += 4){
-                    d[i] = p;
-                    d[i + 1] = p;
-                    d[i + 2] = p;
-                    d[i + 3] = p;
-                }
-            }
-            else{
-                for(int i = 0; i < num; i++)
-                    d[i] = p;
+    if (tensor->dataType == X_INT) {
+        int * d = (int*)tensor->data;
+        int v = (int)value;
+        if (num % 4 == 0) {
+            for (int i = 0; i < num; i += 4) {
+                d[i] = v;
+                d[i + 1] = v;
+                d[i + 2] = v;
+                d[i + 3] = v;
            }
        }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedInt(tensor, p);
-#endif
+        else {
+            for (int i = 0; i < num; i++)
+                d[i] = v;
        }
    }
-    else if(tensor->dataType == X_FLOAT){
-        float p = *(float*)valuePointer;
-        if(tensor->devID < 0){
-            float * d = (float*)tensor->data;
-            if(num % 4 == 0){
-                for(int i = 0; i < num; i += 4){
-                    d[i] = p;
-                    d[i + 1] = p;
-                    d[i + 2] = p;
-                    d[i + 3] = p;
-                }
-            }
-            else{
-                for(int i = 0; i < num; i++)
-                    d[i] = p;
+    else if (tensor->dataType == X_FLOAT) {
+        float * d = (float*)tensor->data;
+        float v = (float)value;
+        if (num % 4 == 0) {
+            for (int i = 0; i < num; i += 4) {
+                d[i] = v;
+                d[i + 1] = v;
+                d[i + 2] = v;
+                d[i + 3] = v;
            }
        }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedFloat(tensor, p);
-#endif
+        else {
+            for (int i = 0; i < num; i++)
+                d[i] = v;
        }
    }
-    else if(tensor->dataType == X_DOUBLE){
-        double p = *(double*)valuePointer;
-        if(tensor->devID < 0){
-            double * d = (double*)tensor->data;
-            if(num % 4 == 0){
-                for(int i = 0; i < num; i += 4){
-                    d[i] = p;
-                    d[i + 1] = p;
-                    d[i + 2] = p;
-                    d[i + 3] = p;
-                }
-            }
-            else{
-                for(int i = 0; i < num; i++)
-                    d[i] = p;
+    else if (tensor->dataType == X_DOUBLE) {
+        double * d = (double*)tensor->data;
+        double v = (double)value;
+
+        if (num % 4 == 0) {
+            for (int i = 0; i < num; i += 4) {
+                d[i] = v;
+                d[i + 1] = v;
+                d[i + 2] = v;
+                d[i + 3] = v;
            }
        }
-        else{
-#ifdef USE_CUDA
-            _CudaSetDataFixedDouble(tensor, p);
-#endif
+        else {
+            for (int i = 0; i < num; i++)
+                d[i] = v;
        }
    }
-    else{
-        ShowNTErrors("TODO");
-    }
-}
-
-/* 
-generate data items with a fixed value p (in default type) 
->> tensor - the tensor whose data array would be initialized
->> p - number in default type
-*/
-void SetDataFixed(XTensor &tensor, DTYPE p)
-{
-    _SetDataFixed(&tensor, &p);
-}
-    
-/*
-generate data items with a fixed value p (in integer)
->> tensor - the tensor whose data array would be initialized
->> p - an integer
-*/
-void SetDataFixedInt(XTensor &tensor, int p)
-{
-    CheckNTErrors(tensor.dataType == X_INT, "An integer tensor is required!");
-    _SetDataFixed(&tensor, &p);
-}
-
-/* 
-generate data items with a fixed value p (in integer) 
->> tensor - the tensor whose data array would be initialized
->> p - an int-valued number
-*/
-void _SetDataFixedInt(XTensor * tensor, int p)
-{
-    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
-
-    if(p == 0)
-        tensor->SetZeroAll();
    else
-        _SetDataFixed(tensor, &p);
-}
-
-/*
-generate data items with a fixed value p (in float) 
->> tensor - the tensor whose data array would be initialized
->> p - a float-valued number
-*/
-void _SetDataFixedFloat(XTensor * tensor, float p)
-{
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
-
-    if(p == 0)
-        tensor->SetZeroAll();
-    else
-        _SetDataFixed(tensor, &p);
+        ShowNTErrors("TODO");
 }

-/* 
-generate data items with a fixed value p (in double) 
->> tensor - the tensor whose data array would be initialized
->> p - a double-valued number
-*/
-void _SetDataFixedDouble(XTensor * tensor, double p)
-{
-    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
-
-    if(p == 0)
-        tensor->SetZeroAll();
-    else
-        _SetDataFixed(tensor, &p);
-}
+template void _SetDataFixed<int>(XTensor*, int);
+template void _SetDataFixed<float>(XTensor*, float);
+template void _SetDataFixed<double>(XTensor*, double);

 /* 
 set data items along with a given dimension (and keep the remaining items unchanged) 
@@ -396,7 +322,7 @@ generate data items with a uniform distribution in [lower, upper]
 >> lower - lower value of the range
 >> upper - upper value of the range
 */
-void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
+void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 {
    CheckNTErrors(upper > lower, "the high value must be greater than low value!");

@@ -433,10 +359,6 @@ void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
 #ifdef USE_CUDA
        _CudaSetDataRand(tensor, lower, upper);
 #endif
-        //XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
-        //_SetDataRand(t2, low, high);
-        //_CopyValues(t2, tensor);
-        //delete t2;
    }
 }

@@ -449,10 +371,8 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
 >> p - the threshold
 >> value - the value we intend to assign to the item
 */
-void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
+void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
 {
-    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
-
    if (tensor->devID < 0) {
        _SetDataRand(tensor, lower, upper);


--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
@@ -19,6 +19,7 @@
 /*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
 * I'm surprised that I did not write this file till today.
+* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
 */

 #include <curand.h>
@@ -27,17 +28,19 @@
 #include <curand_kernel.h>
 #include "../../XDevice.h"
 #include "../../XUtility.h"
+#include "ConvertDataType.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* 
-set an integer data array with a fixed value p (in int) 
+/*
+set a data array with a fixed value p (in int, float, float16, double)
 >> d - pointer to the data array
 >> size - size of the array
 >> p - the initial value
 */
-__global__ 
-void KernelSetDataFixedInt(int * d, int size, int p)
+template<class T>
+__global__
+void KernelSetDataFixed(T * d, int size, T p)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

@@ -45,15 +48,14 @@ void KernelSetDataFixedInt(int * d, int size, int p)
        d[i] = p;
 }

-/* 
-generate data items with a fixed value p (in int) 
+/*
+generate data items with a fixed value p (in int, float, float16, double)
 >> tensor - the tensor for initialization
 >> p - the initial value
 */
-void _CudaSetDataFixedInt(XTensor * tensor, int p)
+template<class T>
+void _CudaSetDataFixed(XTensor * tensor, T p)
 {
-    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
-
    int gridSize[3];
    int blockSize[3];

@@ -65,34 +67,59 @@ void _CudaSetDataFixedInt(XTensor * tensor, int p)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataFixedInt <<<blocks, threads >>>((int*)tensor->data, tensor->unitNum, p);
+    if (tensor->dataType == X_INT){
+        KernelSetDataFixed<<<blocks, threads>>>((int*)tensor->data, tensor->unitNum, (int)p);
+    }
+    else if (tensor->dataType == X_FLOAT){
+        KernelSetDataFixed<<<blocks, threads>>>((DTYPE*)tensor->data, tensor->unitNum, (float)p);
+    }
+    else if (tensor->dataType == X_DOUBLE){
+        KernelSetDataFixed<<<blocks, threads>>>((double*)tensor->data, tensor->unitNum, (double)p);
+    }
+    else if (tensor->dataType == X_FLOAT16){
+        half p1 = __float2half(p);
+        KernelSetDataFixed<<<blocks, threads>>>((__half*)tensor->data, tensor->unitNum, p1);
+    }
+    else
+        ShowNTErrors("TODO");

    BacktoCudaDev(tensor->devID, devIDBackup);
 }

-/* 
-set a float data array with a fixed value p (in int) 
->> d - pointer to the data array
+template void _CudaSetDataFixed<int>(XTensor*, int);
+template void _CudaSetDataFixed<float>(XTensor*, float);
+template void _CudaSetDataFixed<double>(XTensor*, double);
+//__device__
+//template void _CudaSetDataFixed<half>(XTensor*, half);
+
+/*
+set data array with a uniform distribution in [low, high]
+>> deviceStates - the state of curand
+>> d - float, float16, double datatype pointer to the data array
 >> size - size of the array
->> p - the initial value
+>> lower - low value of the range
+>> variance - the width of the range (upper - lower)
 */
-__global__ 
-void KernelSetDataFixedFloat(float * d, int size, float p)
+template<class T>
+__global__
+void KernelSetDataRand(T * d, int size, T lower, T variance)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

-    if (i < size)
-        d[i] = p;
+    if (i < size) {
+        d[i] = d[i] * variance + lower;
+    }
 }

-/* 
-generate data items with a fixed value p (in float)
->> tensor - the tensor for initialization
->> p - the initial value
+/*
+generate data items with a uniform distribution in [lower, upper]
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
 */
-void _CudaSetDataFixedFloat(XTensor * tensor, float p)
+void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 {
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
+    CheckNTErrors(upper > lower, "the high value must be greater than low value!");

    int gridSize[3];
    int blockSize[3];
@@ -105,34 +132,69 @@ void _CudaSetDataFixedFloat(XTensor * tensor, float p)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataFixedFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, p);
+    XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
+
+    if (tensor->dataType == X_FLOAT || tensor->dataType == X_DOUBLE){
+        curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
+        curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
+    }
+    else {
+        curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
+        curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
+    }
+
+    DTYPE variance = upper - lower;
+
+    if (tensor->dataType == X_FLOAT){
+        KernelSetDataRand<<<blocks, threads>>>((DTYPE*)tensor->data, tensor->unitNum, lower, variance);
+    }
+    else if (tensor->dataType == X_FLOAT16){
+        _ConvertDataType(&tensor1, tensor);
+
+        half lower1 = __float2half(lower);
+        half variance1 = __float2half(variance);
+        KernelSetDataRand<<<blocks, threads>>>((__half*)tensor->data, tensor->unitNum, lower1, variance1);
+    }
+    else {
+        ShowNTErrors("TODO");
+    }

    BacktoCudaDev(tensor->devID, devIDBackup);
 }

-/* 
-set a double data array with a fixed value p (in int) 
+/*
+set data items to a pre-defined value if its value >= p, set it to 0 otherwise
 >> d - pointer to the data array
 >> size - size of the array
->> p - the initial value
+>> p - the threshold
+>> value - the value assigned to an item if it is >= p (the item is set to 0 otherwise)
 */
-__global__ 
-void KernelSetDataFixedDouble(double * d, int size, double p)
+template<class T>
+__global__
+void KernelSetDataPCut(T * d, int size, T p, T value)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

-    if (i < size)
-        d[i] = p;
+    if (i < size) {
+        if (d[i] >= p)
+            d[i] = value;
+        else
+            d[i] = 0;
+    }
 }

-/* 
-generate data items with a fixed value p (in double) 
->> tensor - the tensor for initialization
->> p - the initial value
+/*
+generate data items with a uniform distribution in [lower, upper] and set
+the item to a pre-defined value if the item >= p, set the item to 0 otherwise
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
+>> p - the threshold
+>> value - the value we intend to assign to the item
 */
-void _CudaSetDataFixedDouble(XTensor * tensor, double p)
+void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
 {
-    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
+    _CudaSetDataRand(tensor, lower, upper);

    int gridSize[3];
    int blockSize[3];
@@ -145,67 +207,19 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataFixedDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, p);
-
-    BacktoCudaDev(tensor->devID, devIDBackup);
-}
-
-/* 
-set data array with a uniform distribution in [low, high] 
->> deviceStates - the state of curand
->> d - float datatype pointer to the data array 
->> size - size of the array
->> lower - low value of the range
->> variance - the variance of the range
-*/
-__global__
-void KernelSetDataRandFloat(float * d, int size, DTYPE lower, DTYPE variance)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    
-    if (i < size) {
-        d[i] = d[i] * variance + lower;
+    if (tensor->dataType == X_FLOAT) {
+        KernelSetDataPCut<<<blocks, threads>>>((DTYPE*)tensor->data, tensor->unitNum, p, value);
    }
-}
-/* 
-set data array with a uniform distribution in [low, high] 
->> deviceStates - the state of curand
->> d - double datatype pointer to the data array
->> size - size of the array
->> lower - low value of the range
->> variance - the variance of the range
-*/
-__global__
-void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    
-    if (i < size){
-        d[i] = d[i] * variance + lower;
+    else if (tensor->dataType == X_FLOAT16) {
+        half p1 = __float2half(p);
+        half value1 = __float2half(value);
+        KernelSetDataPCut<<<blocks, threads>>>((__half*)tensor->data, tensor->unitNum, p1, value1);
    }
-}

-/*
-set data items to a pre-defined value if its value >= p, set it to 0 otherwise
->> d - pointer to the data array
->> size - size of the array
->> lower - low value of the range
->> variance - the variance of the range
-*/
-__global__
-void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size) {
-        if (d[i] >= p)
-            d[i] = value;
-        else
-            d[i] = 0;
-    }
+    BacktoCudaDev(tensor->devID, devIDBackup);
 }

-/* 
+/*
 set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
 >> tensor - the tensor whose data array would be initialized
 >> beg - the beginning position
@@ -213,8 +227,9 @@ set data items along with a given dimension (and keep the remaining items unchan
 >> blockSize - size of a data block
 >> blockNum - number of data blocks
 */
+template<class T>
 __global__
-void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, DTYPE p)
+void KernelSetDataDim(T * d, int beg, int len, int blockSize, int blockNum, T p)
 {
    /* offset in each block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -222,10 +237,10 @@ void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, 
    /* block id */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

-    if(i >= blockSize || j > blockNum)
+    if (i >= blockSize || j > blockNum)
        return;

-    if(i < beg || i >= beg + len)
+    if (i < beg || i >= beg + len)
        return;

    d[blockSize * j + i] = p;
@@ -251,15 +266,14 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
 {
    int n = tensor->order;

-    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
    CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
    CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");

    int stride = 1;
    int blockSize = 1;
-    int blockNum  = 1;
-    for(int i = n - 1; i > dim; i--){
+    int blockNum = 1;
+    for (int i = n - 1; i > dim; i--) {
        stride *= tensor->GetDim(i);
    }
    blockSize = stride * tensor->GetDim(dim);
@@ -276,8 +290,15 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride, 
-                                            len * stride, blockSize, blockNum, p);
+    if (tensor->dataType == X_FLOAT){
+        KernelSetDataDim<<<blocks, threads>>>((DTYPE*)tensor->data, beg * stride,
+                                               len * stride, blockSize, blockNum, p);
+    }
+    else if (tensor->dataType == X_FLOAT16){
+        half p1 = __float2half(p);
+        KernelSetDataDim<<<blocks, threads>>>((__half*)tensor->data, beg * stride,
+                                               len * stride, blockSize, blockNum, p1);
+    }

    BacktoCudaDev(tensor->devID, devIDBackup);
 }
@@ -292,16 +313,17 @@ modify data items along with a given index and dimension
 >> blockSize - size of a data block
 >> stride - stride of a data block
 */
+template<class T>
 __global__
-void KernelSetDataIndexed(DTYPE * s, DTYPE * m, int blockNum, int blockSize, int stride)
+void KernelSetDataIndexed(T * s, T * m, int blockNum, int blockSize, int stride)
 {
    /* offset in each block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block id */
    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    if(i >= stride || j >= blockNum)
+
+    if (i >= stride || j >= blockNum)
        return;

    int x = blockSize * j + i;
@@ -332,7 +354,6 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
    int order = source->order;
    int size = source->GetDim(dim);

-    CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
    CheckNTErrors(index >= 0 && index < size, "Illegal index!");
    
@@ -358,8 +379,14 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
    int devIDBackup;
    ProtectCudaDev(source->devID, devIDBackup);
    
-    KernelSetDataIndexed<<<blocks, threads >>>((DTYPE*)source->data + index * stride, (DTYPE*)modify->data, 
-                                                blockNum, blockSize, stride);
+    if (source->dataType == X_FLOAT){
+        KernelSetDataIndexed<<<blocks, threads>>>((DTYPE*)source->data + index * stride, (DTYPE*)modify->data,
+                                                   blockNum, blockSize, stride);
+    }
+    else if (source->dataType == X_FLOAT16){
+        KernelSetDataIndexed<<<blocks, threads>>>((__half*)source->data + index * stride, (__half*)modify->data,
+            blockNum, blockSize, stride);
+    }

    BacktoCudaDev(source->devID, devIDBackup);
 }
@@ -452,71 +479,6 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
 }

 /*
-generate data items with a uniform distribution in [lower, upper]
->> tensor - the tensor whose data array would be initialized
->> lower - lower value of the range
->> upper - upper value of the range
-*/
-void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
-{
-    CheckNTErrors(upper > lower, "the high value must be greater than low value!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(tensor->devID, devIDBackup);
-    
-    curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
-    curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
-    
-    DTYPE variance = upper - lower;
-
-    if(variance != 1.0F || lower != 0){
-        if (tensor->dataType == X_FLOAT)
-            KernelSetDataRandFloat  <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
-        else if (tensor->dataType == X_DOUBLE)
-            KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
-    }
-
-    BacktoCudaDev(tensor->devID, devIDBackup);
-}
-
-/* 
-generate data items with a uniform distribution in [lower, upper] and set
-the item to a pre-defined value if the item >= p, set the item to 0 otherwise 
->> tensor - the tensor whose data array would be initialized
->> lower - lower value of the range
->> upper - upper value of the range
->> p - the threshold
->> value - the value we intend to assign to the item
-*/
-void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
-{
-    _CudaSetDataRand(tensor, lower, upper);
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(tensor->devID, devIDBackup);
-    
-    KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
-
-    BacktoCudaDev(tensor->devID, devIDBackup);
-}
-
-/*
 set the data with an array of offsets (kernel version)
 >> data - pointer to the data array
 >> offsets - offset for each data item

--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
@@ -19,6 +19,7 @@
 /*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
 * I'm surprised that I did not write this file till today.
+* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
 */

 #ifndef __SETDATA_CUH__
@@ -28,14 +29,9 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* generate data items with a fixed value p (in int) */
-void _CudaSetDataFixedInt(XTensor * tensor, int p);
-
-/* generate data items with a fixed value p (in float) */
-void _CudaSetDataFixedFloat(XTensor * tensor, float p);
-
-/* generate data items with a fixed value p (in double) */
-void _CudaSetDataFixedDouble(XTensor * tensor, double p);
+/* generate data items with a fixed value p (in int, float, float16, double) */
+template<class T>
+void _CudaSetDataFixed(XTensor * tensor, T p);

 /* set data items along with a given dimension (and keep the remaining items unchanged) */
 void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
@@ -47,11 +43,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
 void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);

 /* generate data items with a uniform distribution in [lower, upper] */
-void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
+void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

 /* generate data items with a uniform distribution in [lower, upper] and set
   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
-void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
+void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);

 /* set the data with an array of offsets */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);

--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
@@ -24,29 +24,19 @@
 #define __SETDATA_H__

 #include "../../XTensor.h"
+#include "SetData.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* generate data items with a xavier initialization */
 void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);

-/* generate data items with a fixed value p */
-void _SetDataFixed(XTensor * tensor, void * valuePointer);
+///* generate data items with a fixed value p */
+//void _SetDataFixed(XTensor * tensor, void * valuePointer);

 /* generate data items with a fixed value p (in default type) */
-void SetDataFixed(XTensor &tensor, DTYPE p);
-    
-/* generate data items with a fixed value p (in integer) */
-void SetDataFixedInt(XTensor &tensor, int p);
-
-/* generate data items with a fixed value p (in int) */
-void _SetDataFixedInt(XTensor * tensor, int p);
-
-/* generate data items with a fixed value p (in float) */
-void _SetDataFixedFloat(XTensor * tensor, float p);
-
-/* generate data items with a fixed value p (in double) */
-void _SetDataFixedDouble(XTensor * tensor, double p);
+template<class T>
+void _SetDataFixed(XTensor * tensor, T value);

 /* set data items along with a given dimension (and keep the remaining items unchanged) */
 void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
@@ -58,11 +48,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
 void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);

 /* generate data items with a uniform distribution in [lower, upper] */
-void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
+void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

 /* generate data items with a uniform distribution in [lower, upper] and set 
   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
-void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
+void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);

 /* generate data items with a normal distribution with specified mean and standard deviation */
 void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);

--- a/source/tensor/function/DropoutWithIndex.cpp
+++ b/source/tensor/function/DropoutWithIndex.cpp
@@ -70,7 +70,7 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)

    InitTensor1D(&c, x.unitNum, x.dataType, x.devID, x.mem);

-    _SetDataFixedFloat(&c, 1.0F);
+    _SetDataFixed(&c, 1.0F);

    _DropoutWithIndex(&x, &maskIndex, &c);


--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
@@ -385,11 +385,11 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
 {
    if(t == NULL){
        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
+            _SetDataFixed(dedy, 1.0F);
        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
+            _SetDataFixed(dedy, 1.0);
        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
+            _SetDataFixed(dedy, 1);
        else{
            ShowNTErrors("TODO");
        }

--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
@@ -50,7 +50,7 @@ bool TestDropout1()
    XTensor yUser;

    /* initialize variables */
-    _SetDataFixedFloat(x, 1.0F);
+    _SetDataFixed(x, 1.0F);
    y->SetZeroAll();

    /* call Dropout function */
@@ -88,7 +88,7 @@ bool TestDropout1()
    XTensor yUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(xGPU, 1.0F);
+    _SetDataFixed(xGPU, 1.0F);
    yGPU->SetZeroAll();

    /* call Dropout function */
@@ -157,10 +157,10 @@ bool TestDropout2()
    XTensor * dedy = NewTensor(order, dimSize);

    /* initialize variables */
-    _SetDataFixedFloat(x, 1.0F);
+    _SetDataFixed(x, 1.0F);
    y->SetZeroAll();
    dedx->SetZeroAll();
-    _SetDataFixedFloat(dedy, 1.5F);
+    _SetDataFixed(dedy, 1.5F);

    /* call Dropout function */
    float dropProb = 0.5F;
@@ -183,10 +183,10 @@ bool TestDropout2()
    XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
-    _SetDataFixedFloat(xGPU, 1.0F);
+    _SetDataFixed(xGPU, 1.0F);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
-    _SetDataFixedFloat(dedyGPU, 1.5F);
+    _SetDataFixed(dedyGPU, 1.5F);

    /* call Dropout function */
    _Dropout(xGPU, yGPU, seed, dropProb);

--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
@@ -196,8 +196,8 @@ bool TestReduceSum2()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    _SetDataFixed(s, 1.0F);
+    _SetDataFixed(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -216,7 +216,7 @@ bool TestReduceSum2()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -285,8 +285,8 @@ bool TestReduceSum3()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    _SetDataFixed(s, 1.0F);
+    _SetDataFixed(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -305,7 +305,7 @@ bool TestReduceSum3()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -374,8 +374,8 @@ bool TestReduceSum4()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    _SetDataFixed(s, 1.0F);
+    _SetDataFixed(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -394,7 +394,7 @@ bool TestReduceSum4()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -465,8 +465,8 @@ bool TestReduceSum5()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    _SetDataFixed(s, 1.0F);
+    _SetDataFixed(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -485,7 +485,7 @@ bool TestReduceSum5()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);
@@ -556,8 +556,8 @@ bool TestReduceSum6()
    XTensor tUser;

    /* initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
-    _SetDataFixedFloat(answer, (float)s->GetDim(1));
+    _SetDataFixed(s, 1.0F);
+    _SetDataFixed(answer, (float)s->GetDim(1));

    /* call ReduceSum function */
    _ReduceSum(s, t, 1);
@@ -576,7 +576,7 @@ bool TestReduceSum6()
    XTensor tUserGPU;

    /* initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0F);

    /* call ReduceSum function */
    _ReduceSum(sGPU, tGPU, 1);

--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
-/* NiuTrans.Tensor - an open-source tensor library
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northestern University. 
 * All rights reserved.
 *
@@ -17,10 +17,12 @@

 /*
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+ * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
 */

 #include "TSetData.h"
 #include "../core/getandset/SetData.h"
+#include "../core/getandset/ConvertDataType.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -118,7 +120,7 @@ bool TestSetData2()
    XTensor * modify = NewTensor(dataOrder, dataDimSize);

    /* Initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixed(s, 1.0F);
    modify->SetData(data, dataUnitNum);

    /* call SetDataIndexed function */
@@ -136,7 +138,7 @@ bool TestSetData2()
    XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);

    /* Initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0);
    modifyGPU->SetData(data, dataUnitNum);

    /* call SetDataIndexed function */
@@ -211,11 +213,11 @@ bool TestSetData3()
    XTensor * modify = NewTensor(dataOrder, dataDimSize);

    /* Initialize variables */
-    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixed(s, 1.0);
    modify->SetData(data, dataUnitNum);

    /* call SetDataIndexed function */
-    _SetDataFixedFloat(s, 1.0F);
+    _SetDataFixed(s, 1.0);
    _SetDataIndexed(s, modify, 1, 1);
    
    /* check results */
@@ -230,7 +232,7 @@ bool TestSetData3()
    XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);

    /* Initialize variables */
-    _SetDataFixedFloat(sGPU, 1.0F);
+    _SetDataFixed(sGPU, 1.0);
    modifyGPU->SetData(data, dataUnitNum);
    
    /* call SetDataIndexed function */
@@ -406,6 +408,427 @@ bool TestSetData5()
 #endif // USE_CUDA
 }

+/*
+case 6: float16 test of the SetDataRand function (the check is compiled only with USE_CUDA).
+set the tensor items by a uniform distribution in range [lower, upper]; this case passes (0.0, 1.0).
+*/
+bool TestSetData6()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 4;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    DTYPE answer[2][4] = {0};
+
+    /* CPU test: nothing is checked on the CPU in this case, so the flag stays true */
+    bool cpuTest = true;
+
+#ifdef USE_CUDA
+    /* GPU test flag, overwritten by the CheckData result below */
+    bool gpuTest = true;
+
+    /* create the float tensor (the last argument 0 is presumably the GPU device id -- TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+
+    /* float16 tensor that receives the converted copy of sGPU */
+    XTensor sHalfGPU;
+
+    /* convert data type from float to float16 */
+    sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
+
+    /* call the _SetDataRand function on the float16 tensor with bounds 0.0 and 1.0 */
+    _SetDataRand(&sHalfGPU, 0.0, 1.0);
+
+    /* convert data type from float16 back into the float tensor sGPU */
+    _ConvertDataType(&sHalfGPU, sGPU);
+
+    /* check results against the all-zero answer; 1.0F is presumably the allowed tolerance -- TODO confirm */
+    gpuTest = sGPU->CheckData(answer, sUnitNum, 1.0F);
+
+    /* destroy variables */
+    delete sGPU;
+    delete[] sDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* without USE_CUDA only the dimension array has to be released */
+    delete[] sDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/*
+case 7: float16 test of the SetDataRandP function (the check is compiled only with USE_CUDA).
+first set the tensor items by a uniform distribution in range [lower, upper].
+then set the item to a pre-defined value if the item >= p, and set the item to 0 otherwise.
+*/
+bool TestSetData7()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 4;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    DTYPE answer[2][4] = {0};
+
+    /* CPU test: nothing is checked on the CPU in this case, so the flag stays true */
+    bool cpuTest = true;
+
+#ifdef USE_CUDA
+    /* GPU test flag, overwritten by the CheckData result below */
+    bool gpuTest = true;
+
+    /* create the float tensor (the last argument 0 is presumably the GPU device id -- TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+
+    /* float16 tensor that receives the converted copy of sGPU */
+    XTensor sHalfGPU;
+
+    /* convert data type from float to float16 */
+    sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
+
+    /* call the _SetDataRandP function; arguments are presumably lower 0.0, upper 1.0, p 0.5, value 1.0 -- TODO confirm */
+    _SetDataRandP(&sHalfGPU, 0.0, 1.0, 0.5, 1.0);
+
+    /* convert data type from float16 back into the float tensor sGPU */
+    _ConvertDataType(&sHalfGPU, sGPU);
+
+    /* check results against the all-zero answer; 1.1F is presumably the allowed tolerance -- TODO confirm */
+    gpuTest = sGPU->CheckData(answer, sUnitNum, 1.1F);
+
+    /* destroy variables */
+    delete sGPU;
+    delete[] sDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* without USE_CUDA only the dimension array has to be released */
+    delete[] sDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/*
+case 8: float16 test of the SetDataIndexed function (the check is compiled only with USE_CUDA).
+modify data items along a given dimension; the expected answer has row 1 replaced by the data vector.
+*/
+bool TestSetData8()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 4;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    /* a data tensor of size (4), used only by the GPU test */
+    int dataOrder = 1;
+    int * dataDimSize = new int[dataOrder];
+    dataDimSize[0] = 4;
+
+    int dataUnitNum = 1;
+    for (int i = 0; i < dataOrder; i++)
+        dataUnitNum *= dataDimSize[i];
+
+    DTYPE data[4] = {0.0F, 1.0F, 2.0F, 3.0F};
+    DTYPE answer[2][4] = { {1.0F, 1.0F, 1.0F, 1.0F},
+                           {0.0F, 1.0F, 2.0F, 3.0F} };
+
+    /* CPU test: nothing is checked on the CPU in this case, so the flag stays true */
+    bool cpuTest = true;
+
+#ifdef USE_CUDA
+    /* GPU test flag, overwritten by the CheckData result below */
+    bool gpuTest = true;
+
+    /* create the float tensors (the last argument 0 is presumably the GPU device id -- TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
+
+    /* float16 tensors that receive the converted copies */
+    XTensor sHalfGPU;
+    XTensor modifyHalfGPU;
+
+    /* initialize modifyGPU with the values of data */
+    modifyGPU->SetData(data, dataUnitNum);
+
+    /* convert data type from float to float16 */
+    sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
+    modifyHalfGPU = ConvertDataType(*modifyGPU, X_FLOAT16);
+
+    /* initialize sHalfGPU with the fixed value 1.0 (the value of the untouched items in answer) */
+    _SetDataFixed(&sHalfGPU, 1.0);
+
+    /* call the _SetDataIndexed function; the trailing (0, 1) is presumably (dim, index) -- TODO confirm */
+    _SetDataIndexed(&sHalfGPU, &modifyHalfGPU, 0, 1);
+
+    /* convert data type from float16 back into the float tensor sGPU */
+    _ConvertDataType(&sHalfGPU, sGPU);
+
+    /* check results against answer; 1e-5F is presumably the allowed tolerance -- TODO confirm */
+    gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
+
+    /* destroy variables */
+    delete sGPU;
+    delete modifyGPU;
+    delete[] sDimSize;
+    delete[] dataDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* without USE_CUDA only the dimension arrays have to be released */
+    delete[] sDimSize;
+    delete[] dataDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/*
+case 9: float16 test of the SetDataIndexed function on a 3-D tensor (the check is compiled only with USE_CUDA).
+modify data items along a given dimension; the expected answer has index 1 of the size-4 dimension replaced by data.
+*/
+bool TestSetData9()
+{
+    /* an input tensor of size (2, 4, 3) */
+    int sOrder = 3;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 4;
+    sDimSize[2] = 3;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    /* a data tensor of size (2, 3), used only by the GPU test */
+    int dataOrder = 2;
+    int * dataDimSize = new int[dataOrder];
+    dataDimSize[0] = 2;
+    dataDimSize[1] = 3;
+
+    int dataUnitNum = 1;
+    for (int i = 0; i < dataOrder; i++)
+        dataUnitNum *= dataDimSize[i];
+
+    DTYPE data[2][3] = { { 0.0F, 1.0F, 2.0F },
+                         { 3.0F, 4.0F, 5.0F } };
+
+    DTYPE answer[2][4][3] = { { {1.0F, 1.0F, 1.0F},
+                                {0.0F, 1.0F, 2.0F},
+                                {1.0F, 1.0F, 1.0F},
+                                {1.0F, 1.0F, 1.0F} },
+                              { {1.0F, 1.0F, 1.0F},
+                                {3.0F, 4.0F, 5.0F},
+                                {1.0F, 1.0F, 1.0F},
+                                {1.0F, 1.0F, 1.0F} } };
+
+    /* CPU test: nothing is checked on the CPU in this case, so the flag stays true */
+    bool cpuTest = true;
+
+#ifdef USE_CUDA
+    /* GPU test flag, overwritten by the CheckData result below */
+    bool gpuTest = true;
+
+    /* create the float tensors (the last argument 0 is presumably the GPU device id -- TODO confirm) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
+
+    /* float16 tensors that receive the converted copies */
+    XTensor sHalfGPU;
+    XTensor modifyHalfGPU;
+
+    /* initialize modifyGPU with the values of data */
+    modifyGPU->SetData(data, dataUnitNum);
+
+    /* convert data type from float to float16 */
+    sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
+    modifyHalfGPU = ConvertDataType(*modifyGPU, X_FLOAT16);
+
+    /* initialize sHalfGPU with the fixed value 1.0 (the value of the untouched items in answer) */
+    _SetDataFixed(&sHalfGPU, 1.0);
+
+    /* call the _SetDataIndexed function; the trailing (1, 1) is presumably (dim, index) -- TODO confirm */
+    _SetDataIndexed(&sHalfGPU, &modifyHalfGPU, 1, 1);
+
+    /* convert data type from float16 back into the float tensor sGPU */
+    _ConvertDataType(&sHalfGPU, sGPU);
+
+    /* check results against answer; 1e-5F is presumably the allowed tolerance -- TODO confirm */
+    gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
+
+    /* destroy variables */
+    delete sGPU;
+    delete modifyGPU;
+    delete[] sDimSize;
+    delete[] dataDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* without USE_CUDA only the dimension arrays have to be released */
+    delete[] sDimSize;
+    delete[] dataDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/*
+case 10: float16 test of the SetDataDim function (the check is compiled only with USE_CUDA).
+set data items along a given dimension (and keep the remaining items unchanged); the expected answer has row 1 zeroed.
+*/
+bool TestSetData10()
+{
+    /* an input tensor of size (3, 3) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 3;
+    dimSize[1] = 3;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE sData[3][3] = { {1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F},
+                          {7.0F, 8.0F, 9.0F} };
+    DTYPE answer[3][3] = { {1.0F, 2.0F, 3.0F},
+                           {0.0F, 0.0F, 0.0F},
+                           {7.0F, 8.0F, 9.0F} };
+
+    /* CPU test: nothing is checked on the CPU in this case, so the flag stays true */
+    bool cpuTest = true;
+
+#ifdef USE_CUDA
+    /* GPU test flag, overwritten by the CheckData result below */
+    bool gpuTest = true;
+
+    /* create the float tensor (the last argument 0 is presumably the GPU device id -- TODO confirm) */
+    XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+
+    /* float16 tensor that receives the converted copy of sGPU */
+    XTensor sHalfGPU;
+
+    /* initialize sGPU with the values of sData */
+    sGPU->SetData(sData, unitNum);
+
+    /* convert data type from float to float16 */
+    sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
+
+    /* call the _SetDataDim function; (1, 1, 0, 0) is presumably (begin, length, dim, value) -- TODO confirm */
+    _SetDataDim(&sHalfGPU, 1, 1, 0, 0);
+
+    /* convert data type from float16 back into the float tensor sGPU */
+    _ConvertDataType(&sHalfGPU, sGPU);
+
+    /* check results against answer; 1e-4F is presumably the allowed tolerance -- TODO confirm */
+    gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
+
+    /* destroy variables */
+    delete sGPU;
+    delete[] dimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* without USE_CUDA only the dimension array has to be released */
+    delete[] dimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/*
+case 11: float16 test of the SetDataDim function on a 3-D tensor (the check is compiled only with USE_CUDA).
+set data items along a given dimension (and keep the remaining items unchanged); the expected answer has index 2 of the size-4 dimension set to 5.
+*/
+bool TestSetData11()
+{
+    /* an input tensor of size (2, 4, 3) */
+    int order = 3;
+    int * dimSize = new int[order];
+    dimSize[0] = 2;
+    dimSize[1] = 4;
+    dimSize[2] = 3;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE data[2][4][3] = { { {1.0F, 1.0F, 1.0F},
+                              {0.0F, 1.0F, 2.0F},
+                              {1.0F, 1.0F, 1.0F},
+                              {1.0F, 1.0F, 1.0F} },
+                            { {1.0F, 1.0F, 1.0F},
+                              {3.0F, 4.0F, 5.0F},
+                              {1.0F, 1.0F, 1.0F},
+                              {1.0F, 1.0F, 1.0F} } };
+
+    DTYPE answer[2][4][3] = { { {1.0F, 1.0F, 1.0F},
+                                {0.0F, 1.0F, 2.0F},
+                                {5.0F, 5.0F, 5.0F},
+                                {1.0F, 1.0F, 1.0F} },
+                              { {1.0F, 1.0F, 1.0F},
+                                {3.0F, 4.0F, 5.0F},
+                                {5.0F, 5.0F, 5.0F},
+                                {1.0F, 1.0F, 1.0F} } };
+
+    /* CPU test: nothing is checked on the CPU in this case, so the flag stays true */
+    bool cpuTest = true;
+
+#ifdef USE_CUDA
+    /* GPU test flag, overwritten by the CheckData result below */
+    bool gpuTest = true;
+
+    /* create the float tensor (the last argument 0 is presumably the GPU device id -- TODO confirm) */
+    XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+
+    /* float16 tensor that receives the converted copy of sGPU */
+    XTensor sHalfGPU;
+
+    /* initialize sGPU with the values of data */
+    sGPU->SetData(data, unitNum);
+
+    /* convert data type from float to float16 */
+    sHalfGPU = ConvertDataType(*sGPU, X_FLOAT16);
+
+    /* call the _SetDataDim function; (2, 1, 1, 5.0F) is presumably (begin, length, dim, value) -- TODO confirm */
+    _SetDataDim(&sHalfGPU, 2, 1, 1, 5.0F);
+
+    /* convert data type from float16 back into the float tensor sGPU */
+    _ConvertDataType(&sHalfGPU, sGPU);
+
+    /* check results against answer; 1e-4F is presumably the allowed tolerance -- TODO confirm */
+    gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
+
+    /* destroy variables */
+    delete sGPU;
+    delete[] dimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* without USE_CUDA only the dimension array has to be released */
+    delete[] dimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
 /* other cases */
 /*
 TODO!!
@@ -462,6 +885,60 @@ bool TestSetData()
    else
        XPRINT(0, stdout, ">> case 5 passed!\n");

+    /* case 6 test */
+    caseFlag = TestSetData6();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 6 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 6 passed!\n");
+
+    /* case 7 test */
+    caseFlag = TestSetData7();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 7 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 7 passed!\n");
+
+    /* case 8 test */
+    caseFlag = TestSetData8();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 8 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 8 passed!\n");
+
+    /* case 9 test */
+    caseFlag = TestSetData9();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 9 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 9 passed!\n");
+
+    /* case 10 test */
+    caseFlag = TestSetData10();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 10 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 10 passed!\n");
+
+    /* case 11 test */
+    caseFlag = TestSetData11();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 11 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 11 passed!\n");
+
    /* other cases test */
    /*
    TODO!!

--- a/source/tensor/test/TSpread.cpp
+++ b/source/tensor/test/TSpread.cpp
@@ -90,7 +90,7 @@ bool TestSpread1()
    XTensor * modify = NewTensor(dataOrder, dataDimSize);

    /* Initialize variables */
-    _SetDataFixedFloat(s, 0.0F);
+    _SetDataFixed(s, 0.0F);
    modify->SetData(data, dataUnitNum);

    /* call _Spread function */
@@ -108,7 +108,7 @@ bool TestSpread1()
    XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);

    /* Initialize variables */
-    _SetDataFixedFloat(sGPU, 0.0F);
+    _SetDataFixed(sGPU, 0.0F);
    modifyGPU->SetData(data, dataUnitNum);
    
    /* call _Spread function */

--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
@@ -295,8 +295,8 @@ bool TestSumDim3()
    /* initialize variables */
    a->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(b, 1.0F);
-    _SetDataFixedFloat(answer, 1.0F);
+    _SetDataFixed(b, 1.0F);
+    _SetDataFixed(answer, 1.0F);

    /* call SumDim function */
    _SumDim(a, b, c, 1);
@@ -322,7 +322,7 @@ bool TestSumDim3()
    /* Initialize variables */
    aGPU->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(bGPU, 1.0F);
+    _SetDataFixed(bGPU, 1.0F);

    /* call sum function */
    _SumDim(aGPU, bGPU, cGPU, 1);
@@ -404,8 +404,8 @@ bool TestSumDim4()
    /* initialize variables */
    a->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(b, 1.0F);
-    _SetDataFixedFloat(answer, 1.0F);
+    _SetDataFixed(b, 1.0F);
+    _SetDataFixed(answer, 1.0F);

    /* call SumDim function */
    _SumDim(a, b, c, 1);
@@ -431,7 +431,7 @@ bool TestSumDim4()
    /* Initialize variables */
    aGPU->SetZeroAll();
    cMe->SetZeroAll();
-    _SetDataFixedFloat(bGPU, 1.0F);
+    _SetDataFixed(bGPU, 1.0F);

    /* call sum function */
    _SumDim(aGPU, bGPU, cGPU, 1);

--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
@@ -30,7 +30,7 @@ bool Test()
    XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
    
    //wrong = !TestAbsolute() || wrong;
-    wrong = !TestClip() || wrong;
+    //wrong = !TestClip() || wrong;
    //wrong = !TestCompare() || wrong;
    //wrong = !TestConcatenate() || wrong;
    //wrong = !TestConcatenateSolely() || wrong;
@@ -38,8 +38,8 @@ bool Test()
    //wrong = !TestConvertDataType() || wrong;
    //wrong = !TestCopyIndexed() || wrong;
    //wrong = !TestCopyValues() || wrong;
-    wrong = !TestDiv() || wrong;
-    wrong = !TestDivDim() || wrong;
+    //wrong = !TestDiv() || wrong;
+    //wrong = !TestDivDim() || wrong;
    //wrong = !TestExp() || wrong;
    //wrong = !TestGather() || wrong;
    //wrong = !TestLog() || wrong;
@@ -49,7 +49,7 @@ bool Test()
    //wrong = !TestMatrixMulBatched() || wrong;
    //wrong = !TestMerge() || wrong;
    //wrong = !TestMultiply() || wrong;
-    wrong = !TestMultiplyDim() || wrong;
+    //wrong = !TestMultiplyDim() || wrong;
    //wrong = !TestNegate() || wrong;
    //wrong = !TestNormalize() || wrong;
    //wrong = !TestPower() || wrong;
@@ -60,17 +60,17 @@ bool Test()
    //wrong = !TestReduceSumSquared() || wrong;
    //wrong = !TestReduceVariance() || wrong;
    //wrong = !TestRound() || wrong;
-    wrong = !TestScaleAndShift() || wrong;
+    //wrong = !TestScaleAndShift() || wrong;
    //wrong = !TestSelect() || wrong;
    //wrong = !TestSetAscendingOrder() || wrong;
-    //wrong = !TestSetData() || wrong;
+    wrong = !TestSetData() || wrong;
    //wrong = !TestSign() || wrong;
    //wrong = !TestSin() || wrong;
    //wrong = !TestSort() || wrong;
    //wrong = !TestSplit() || wrong;
    //wrong = !TestSpread() || wrong;
    //wrong = !TestSub() || wrong;
-    wrong = !TestSum() || wrong;
+    //wrong = !TestSum() || wrong;
    //wrong = !TestSumByColumnTV() || wrong;
    //wrong = !TestSumByColumnVT() || wrong;
    //wrong = !TestSumDim() || wrong;