Commit 3f48d22b by liyinqiao

Merge with HU Chi branch (Don't use this! It's an incomplete version)

1. Support the X_INT dataType in the Clip and ScaleAndShift functions.
2. Minor errors fixed.
parent 7876ba5b
......@@ -45,18 +45,33 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
#endif
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same shape!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
if (a->dataType == DEFAULT_DTYPE) {
DTYPE* d = (DTYPE*)a->data;
DTYPE* db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
else if (a->dataType == X_INT) {
int* d = (int*)a->data;
int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
else if (d[i] < lower)
db[i] = lower;
else
db[i] = d[i];
}
}
else
ShowNTErrors("TODO!");
}
/*
......
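With this hunk, _Clip dispatches on a->dataType instead of asserting DEFAULT_DTYPE, so integer tensors can now be clipped on the CPU. A minimal sketch of the new path follows; the NewTensor1D helper and the manual data fill are illustrative assumptions, not part of this commit:

/* sketch: clip an int tensor to [0, 5] on the CPU (devID = -1) */
XTensor * a = NewTensor1D(8, X_INT, -1);    /* assumed constructor helper */
XTensor * b = NewTensor1D(8, X_INT, -1);
int * d = (int*)a->data;
for (int i = 0; i < 8; i++)
    d[i] = i - 2;                           /* values -2 .. 5 */
_Clip(a, b, (DTYPE)0, (DTYPE)5);            /* b holds 0, 0, 0, 1, 2, 3, 4, 5 */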
......@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
void KernelClip(T * a, T * b, T lower, T upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
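The hunk is truncated after the thread-index computation. The body of the templated kernel presumably applies the same clamp as the CPU path, now over the template type T; a sketch of such a body (not the verbatim source):

template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        T v = a[i];
        b[i] = v > upper ? upper : (v < lower ? lower : v);
    }
}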
......@@ -63,6 +64,7 @@ This is for float16 computation
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
ShowNTErrors("TODO!");
return;
}
......@@ -90,7 +92,13 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
KernelClip<DTYPE> << <blocks, threads >> >((DTYPE *)a->data, (DTYPE *)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip<int> << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
......
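Note that the X_INT branch casts the DTYPE bounds to int before launching KernelClip<int>, so fractional bounds are truncated toward zero. A hypothetical call:

_CudaClip(a, b, -1.5F, 2.5F);   /* on an X_INT tensor: lower1 = -1, upper1 = 2 */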
......@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
template <class T> __global__
void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type */
__global__
......
......@@ -47,34 +47,62 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
return;
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
/* sparse tensor */
if(a->isSparse){
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
char * db = (char*)b->data + sizeof(int);
char * fb = db + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
DTYPE * v = (DTYPE*)f;
DTYPE * vb = (DTYPE*)fb;
*vb = *v * scale + shift;
f += sizeof(int) + sizeof(DTYPE);
fb += sizeof(int) + sizeof(DTYPE);
if (a->dataType == DEFAULT_DTYPE) {
/* sparse tensor */
if(a->isSparse) {
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
char * db = (char*)b->data + sizeof(int);
char * fb = db + (sizeof(int) + sizeof(DTYPE)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
DTYPE * v = (DTYPE*)f;
DTYPE * vb = (DTYPE*)fb;
*vb = *v * scale + shift;
f += sizeof(int) + sizeof(DTYPE);
fb += sizeof(int) + sizeof(DTYPE);
}
}
/* dense tensor */
else {
DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
}
}
/* dense tensor */
else{
DTYPE * va = (DTYPE*)a->data;
DTYPE * vb = (DTYPE*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
else if (a->dataType == X_INT) {
/* sparse tensor */
if(a->isSparse) {
int num = a->unitNumNonZero;
char * d = (char*)a->data + sizeof(int);
char * f = d + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
char * db = (char*)b->data + sizeof(int);
char * fb = db + (sizeof(int) + sizeof(int)) * 0 + sizeof(int);
for(int i = 0; i < num; i++){
int * v = (int*)f;
int * vb = (int*)fb;
*vb = *v * scale + shift;
f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int);
}
}
/* dense tensor */
else {
int * va = (int*)a->data;
int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
va++;
vb++;
}
}
}
else
ShowNTErrors("TODO!");
}
/*
......
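In the X_INT branch the multiply and add are still carried out against the DTYPE scale and shift, and the float result is truncated when stored back into the int output. A small sketch of the dense path (tensor construction assumed as in the Clip example above):

/* sketch: b = a * 2.5 + 1 on an int tensor */
XTensor * a = NewTensor1D(4, X_INT, -1);    /* assumed constructor helper */
XTensor * b = NewTensor1D(4, X_INT, -1);
int * d = (int*)a->data;
for (int i = 0; i < 4; i++)
    d[i] = i;                               /* 0, 1, 2, 3 */
_ScaleAndShift(a, b, 2.5F, 1.0F);
/* float results 1.0, 3.5, 6.0, 8.5 truncate to 1, 3, 6, 8 */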
......@@ -34,9 +34,9 @@ scale and shift all tensor entries b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
template<bool isUnitScale, bool isZeroShift>
template<class T, bool isUnitScale, bool isZeroShift>
__global__
void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
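This kernel body is truncated as well. The two bool template parameters exist so that the compiler can drop the multiply when scale == 1 and the add when shift == 0; a sketch consistent with that design (not the verbatim source):

template<class T, bool isUnitScale, bool isZeroShift>
__global__
void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        T v = isUnitScale ? a[i] : a[i] * scale;   /* multiply elided when isUnitScale */
        b[i] = isZeroShift ? v : v + shift;        /* add elided when isZeroShift */
    }
}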
......@@ -108,13 +108,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0)
KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0)
KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else
KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
else if (a->dataType == X_INT) {
int scale2 = int(scale);
int shift2 = int(shift);
if (scale == 1.0F && shift == 0)
KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale == 1.0F && shift != 0)
KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else if (scale != 1.0F && shift == 0)
KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
else
KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
}
else if(a->dataType == X_FLOAT16){
unsigned short scale2 = FloatToFloat16(scale);
......
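As in _CudaClip, the X_INT branch truncates scale and shift to int before the launch, so a fractional scale degenerates: with scale = 1.9F, scale2 becomes 1. Note also that the isUnitScale/isZeroShift dispatch still tests the original float values, so a hypothetical call

_CudaScaleAndShift(a, b, 1.9F, 0.0F);

on an int tensor selects the <int, false, true> variant (scale != 1.0F, shift == 0) and then multiplies every entry by scale2 = 1.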