Commit 3f48d22b by liyinqiao

Merge with HU Chi branch (Don't use this! It's an incomplete version)

1. Support the X_INT dataType in the Clip and ScaleAndShift functions.
2. Minor errors fixed.
parent 7876ba5b
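The diff below adds an X_INT branch next to the existing DEFAULT_DTYPE (float) branch in both the CPU and CUDA code paths of Clip and ScaleAndShift. As a rough illustration of the element-wise behavior the new X_INT branch of _Clip implements, here is a minimal standalone C++ sketch; it is not part of the commit, does not use the XTensor API, and the helper name ClipInt and the sample values are made up for the example.

#include <cstdio>

/* Sketch of the clip rule used by the new X_INT branch of _Clip:
   values above `upper` become `upper`, values below `lower` become `lower`,
   everything else is copied unchanged. */
void ClipInt(const int * a, int * b, int lower, int upper, int num)
{
    for (int i = 0; i < num; i++) {
        if (a[i] > upper)
            b[i] = upper;
        else if (a[i] < lower)
            b[i] = lower;
        else
            b[i] = a[i];
    }
}

int main()
{
    int a[5] = {-7, -1, 0, 3, 9};
    int b[5];
    ClipInt(a, b, -2, 4, 5);      /* b becomes {-2, -1, 0, 3, 4} */
    for (int i = 0; i < 5; i++)
        printf("%d ", b[i]);
    printf("\n");
    return 0;
}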
@@ -45,10 +45,10 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
#endif
    CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
-   CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-   DTYPE * d = (DTYPE*)a->data;
-   DTYPE * db = (DTYPE*)b->data;
+   if (a->dataType == DEFAULT_DTYPE) {
+       DTYPE* d = (DTYPE*)a->data;
+       DTYPE* db = (DTYPE*)b->data;
    for (int i = 0; i < a->unitNum; i++) {
        if (d[i] > upper)
            db[i] = upper;
@@ -57,6 +57,21 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
        else
            db[i] = d[i];
    }
+   }
+   else if (a->dataType == X_INT) {
+       int* d = (int*)a->data;
+       int* db = (int*)b->data;
+       for (int i = 0; i < a->unitNum; i++) {
+           if (d[i] > upper)
+               db[i] = upper;
+           else if (d[i] < lower)
+               db[i] = lower;
+           else
+               db[i] = d[i];
+       }
+   }
+   else
+       ShowNTErrors("TODO!");
}
/*
......
@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
+template <class T>
__global__
-void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
+void KernelClip(T * a, T * b, T lower, T upper, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -63,6 +64,7 @@ This is for float16 computation
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
+   ShowNTErrors("TODO!");
    return;
}
@@ -90,7 +92,13 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
    ProtectCudaDev(a->devID, devIDBackup);
    if (a->dataType == DEFAULT_DTYPE) {
-       KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
+       KernelClip<DTYPE> << <blocks, threads >> >((DTYPE *)a->data, (DTYPE *)b->data, lower, upper, a->unitNum);
+   }
+   else if (a->dataType == X_INT) {
+       int lower1 = (int)lower;
+       int upper1 = (int)upper;
+       KernelClip<int> << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
    }
    else if (a->dataType == X_FLOAT16) {
        KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
......
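The body of the templated KernelClip is cut off in this diff; only the thread-index line is visible. A body consistent with the new declaration and with the CPU branch above would look roughly like the following sketch, which is an assumption for illustration and not necessarily the repository's exact code.

template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size)
{
    /* one thread per element */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size) {
        if (a[i] > upper)
            b[i] = upper;
        else if (a[i] < lower)
            b[i] = lower;
        else
            b[i] = a[i];
    }
}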
@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
-__global__
-void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
+template <class T> __global__
+void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
......
@@ -47,11 +47,9 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
        return;
    }
#endif
-   CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
+   if (a->dataType == DEFAULT_DTYPE) {
    /* sparse tensor */
-   if(a->isSparse){
+   if(a->isSparse) {
        int num = a->unitNumNonZero;
        char * d = (char*)a->data + sizeof(int);
        char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
@@ -66,7 +64,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
        }
    }
    /* dense tensor */
-   else{
+   else {
        DTYPE * va = (DTYPE*)a->data;
        DTYPE * vb = (DTYPE*)b->data;
        for(int i = 0; i < b->unitNum; i++){
@@ -75,6 +73,36 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
            vb++;
        }
    }
+   }
+   else if (a->dataType == X_INT) {
+       /* sparse tensor */
+       if(a->isSparse) {
+           int num = a->unitNumNonZero;
+           char * d = (char*)a->data + sizeof(int);
+           char * f = d + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
+           char * db = (char*)b->data + sizeof(int);
+           char * fb = db + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
+           for(int i = 0; i < num; i++){
+               int * v = (int*)f;
+               int * vb = (int*)fb;
+               *vb = *v * scale + shift;
+               f += sizeof(int) + sizeof(int);
+               fb += sizeof(int) + sizeof(int);
+           }
+       }
+       /* dense tensor */
+       else {
+           int * va = (int*)a->data;
+           int * vb = (int*)b->data;
+           for(int i = 0; i < b->unitNum; i++){
+               *vb = *va * scale + shift;
+               va++;
+               vb++;
+           }
+       }
+   }
+   else
+       ShowNTErrors("TODO!");
}
/*
......
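One detail of the new X_INT branch above: scale and shift remain DTYPE (float), so *vb = *va * scale + shift is evaluated in floating point and then truncated toward zero when stored into the int output. A tiny standalone C++ sketch of that behavior follows; the values are made up for illustration and the snippet is not part of the commit.

#include <cstdio>

int main()
{
    int   va    = 7;
    float scale = 0.5F;
    float shift = 0.0F;

    /* 7 * 0.5F + 0.0F = 3.5F; storing into an int truncates toward zero */
    int vb = va * scale + shift;
    printf("%d\n", vb);           /* prints 3 */
    return 0;
}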
@@ -34,9 +34,9 @@ scale and shift all tensor entires b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
-template<bool isUnitScale, bool isZeroShift>
+template<class T, bool isUnitScale, bool isZeroShift>
__global__
-void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
+void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
    if(a->dataType == DEFAULT_DTYPE){
        if(scale == 1.0F && shift == 0)
-           KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+           KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
        else if (scale == 1.0F && shift != 0)
-           KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+           KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
        else if(scale != 1.0F && shift == 0)
-           KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+           KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
        else
-           KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+           KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+   }
+   else if (a->dataType == X_INT) {
+       int scale2 = int(scale);
+       int shift2 = int(shift);
+       if (scale == 1.0F && shift == 0)
+           KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+       else if (scale == 1.0F && shift != 0)
+           KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+       else if (scale != 1.0F && shift == 0)
+           KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+       else
+           KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
    }
    else if(a->dataType == X_FLOAT16){
        unsigned short scale2 = FloatToFloat16(scale);
......
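The kernel body is not shown in this diff, but isUnitScale and isZeroShift are compile-time template parameters, so each of the four instantiations launched above can drop the multiply and/or the add entirely. The following host-side sketch illustrates that dispatch idea; it is an assumption inferred from the flag names, with a made-up helper ScaleAndShiftOne, and the actual kernel body may differ.

/* Sketch of compile-time specialization on the scale/shift flags:
   the branches below are resolved when the template is instantiated,
   so <T, true, true> reduces to a plain copy, <T, true, false> to an add, etc. */
template<class T, bool isUnitScale, bool isZeroShift>
T ScaleAndShiftOne(T v, T scale, T shift)
{
    if (isUnitScale && isZeroShift)
        return v;                  /* b = a                  */
    else if (isUnitScale)
        return v + shift;          /* b = a + shift          */
    else if (isZeroShift)
        return v * scale;          /* b = a * scale          */
    else
        return v * scale + shift;  /* b = a * scale + shift  */
}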