Commit e155205c by huchi

replace some functions for GPU

parent 9ebfb0be
......@@ -45,6 +45,15 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta;
}
+__global__
+void KernelADD(int * a, int * b, int * c, int size, int beta)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < size)
+        c[i] = a[i] + b[i] * beta;
+}
/*
tensor summation c = a + b * \beta (cuda version)
>> a - a tensor
......@@ -100,6 +109,17 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
}
+    else if (a->dataType == X_INT &&
+             b->dataType == X_INT &&
+             c->dataType == X_INT)
+    {
+        int gridSize[3], blockSize[3];
+        GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+        dim3 blocks(gridSize[0]);
+        dim3 threads(blockSize[0]);
+        KernelADD << <blocks, threads >> >((int*)a->data, (int*)b->data, (int*)c->data, a->unitNum, (int)beta);
+    }
else {
// TODO!!
ShowNTErrors("TODO!");
......
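The two hunks above add an integer overload of KernelADD plus an X_INT dispatch branch in _CudaSum; note that the host-side (int)beta cast truncates any fractional beta. Below is a minimal self-contained sketch (not part of this commit) of the same one-thread-per-element launch pattern; the kernel name KernelADDInt and the main harness are hypothetical, and error checking is omitted.

#include <cstdio>
#include <cuda_runtime.h>

/* same element-wise pattern as the integer KernelADD added above */
__global__
void KernelADDInt(int * a, int * b, int * c, int size, int beta)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        c[i] = a[i] + b[i] * beta;
}

int main()
{
    const int size = 8;
    int ha[size], hb[size], hc[size];
    for (int i = 0; i < size; i++) { ha[i] = i; hb[i] = 10 * i; }

    int *da, *db, *dc;
    cudaMalloc(&da, size * sizeof(int));
    cudaMalloc(&db, size * sizeof(int));
    cudaMalloc(&dc, size * sizeof(int));
    cudaMemcpy(da, ha, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, size * sizeof(int), cudaMemcpyHostToDevice);

    /* one thread per element, 256 threads per block */
    int threads = 256;
    int blocks = (size + threads - 1) / threads;
    KernelADDInt<<<blocks, threads>>>(da, db, dc, size, 2);

    cudaMemcpy(hc, dc, size * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("%d ", hc[i]);   /* prints 0 21 42 ... since c[i] = a[i] + 2 * b[i] */
    printf("\n");

    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}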
......@@ -36,8 +36,9 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
+template <class T>
__global__
-void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
+void KernelClip(T * a, T * b, T lower, T upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -52,21 +53,6 @@ void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
}
-/*
-set each entry to its clip value with float16 data type value (CUDA Kernel)
-This is for float16 computation
->> a - pointer to input data array
->> b - pointer to output data array
->> lower - the lower border
->> upper - the upper border
->> size - size of the data array
-*/
-__global__
-void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
-{
-return;
-}
/*
set each entry to its clip value
>> a - input tensor we are processing
>> b - output tensor we are processing
......@@ -92,8 +78,11 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
-else if (a->dataType == X_FLOAT16) {
-KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
+else if (a->dataType == X_INT) {
+int lower1 = (int)lower;
+int upper1 = (int)upper;
+KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
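With the float16 overload deleted, KernelClip is a single template and the new X_INT branch relies on implicit instantiation: passing (int *) data plus int borders deduces T = int, which is why the diff casts lower/upper to lower1/upper1 before the launch (mixing int * arrays with DTYPE borders would make deduction fail). A minimal sketch of the same deduction, using a hypothetical stand-alone ClipKernel and test harness, error checking omitted:

#include <cstdio>
#include <cuda_runtime.h>

/* templated clip kernel, same shape as the KernelClip template above */
template <class T>
__global__
void ClipKernel(T * a, T * b, T lower, T upper, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        if (a[i] > upper)
            b[i] = upper;
        else if (a[i] < lower)
            b[i] = lower;
        else
            b[i] = a[i];
    }
}

int main()
{
    const int size = 5;
    int h[size] = { -7, -1, 0, 3, 9 };
    int *da, *db;
    cudaMalloc(&da, size * sizeof(int));
    cudaMalloc(&db, size * sizeof(int));
    cudaMemcpy(da, h, size * sizeof(int), cudaMemcpyHostToDevice);

    /* T is deduced as int from the arguments; passing the borders as
       ints mirrors the lower1/upper1 casts in the diff above */
    ClipKernel<<<1, 32>>>(da, db, -2, 4, size);

    cudaMemcpy(h, db, size * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("%d ", h[i]);   /* prints -2 -1 0 3 4 */
    printf("\n");

    cudaFree(da); cudaFree(db);
    return 0;
}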
......@@ -29,8 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
-__global__
-void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
+template <class T> __global__
+void KernelClip(T * a, T * b, T lower, T upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
__global__
......
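One caveat worth noting about this header change (an observation about C++ templates, not something the commit does): a template kernel declared in the .cuh but defined only in one .cu is implicitly instantiated only for the launches that appear in that .cu. Since _CudaClip lives in the same file as the template definition here, nothing extra is needed; but if another translation unit launched KernelClip through the .cuh declaration alone, it would fail to link unless the definition file added explicit instantiations, along these lines:

/* hypothetical addition to Clip.cu: explicit instantiations so that
   translation units that only see the .cuh declaration still link */
template __global__ void KernelClip<int>(int * a, int * b, int lower, int upper, int size);
template __global__ void KernelClip<float>(float * a, float * b, float lower, float upper, int size);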
......@@ -34,9 +34,9 @@ scale and shift all tensor entries b = a * scale + shift (CUDA Kernel)
>> scale - how much we want to scale it
>> shift - how much we want to shift it
*/
-template<bool isUnitScale, bool isZeroShift>
+template<class T, bool isUnitScale, bool isZeroShift>
__global__
-void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
+void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -56,28 +56,6 @@ void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
}
}
-/*
-scale and shift all tensor entires p = p * scale + shift (CUDA Kernel)
-This is for float16 computation
->> a - the input data array
->> b - the output data array
->> size - the size of d
->> scale - how much we want to scale it
->> shift - how much we want to shift it
-*/
-__global__
-void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
-{
-int i = blockDim.x * blockIdx.x + threadIdx.x;
-#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-if(i < size)
-b[i] = __hadd(__hmul(a[i], scale), shift);
-#else
-if (i < size)
-b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
-#endif
-}
/*
scale and shift all tensor entries
......@@ -108,20 +86,26 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
if(a->dataType == DEFAULT_DTYPE){
if(scale == 1.0F && shift == 0)
-KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if (scale == 1.0F && shift != 0)
-KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else if(scale != 1.0F && shift == 0)
-KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
else
-KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
}
-else if(a->dataType == X_FLOAT16){
-unsigned short scale2 = FloatToFloat16(scale);
-unsigned short shift2 = FloatToFloat16(shift);
-__half * scaleft16p = (__half*)&scale2;
-__half * shiftft16p = (__half*)&shift2;
-KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
+else if (a->dataType == X_INT) {
+int scale2 = int(scale);
+int shift2 = int(shift);
+if (scale == 1.0F && shift == 0)
+KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+else if (scale == 1.0F && shift != 0)
+KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+else if (scale != 1.0F && shift == 0)
+KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
+else
+KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
}
else{
ShowNTErrors("TODO!");
......
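The <class T, bool isUnitScale, bool isZeroShift> parameters resolve the scale/shift decisions at compile time: each of the four host-side branches above launches a distinct instantiation in which the dead multiply or add is folded away, instead of testing scale and shift per element on the device. A minimal sketch of the same trick (the names ScaleShift and main are hypothetical, error checking omitted):

#include <cstdio>
#include <cuda_runtime.h>

/* compile-time specialization, same idea as the
   <T, isUnitScale, isZeroShift> flags above: a branch on a
   template bool is resolved at compile time, so the unused
   multiply/add disappears from the generated code */
template<class T, bool isUnitScale, bool isZeroShift>
__global__
void ScaleShift(T * a, T * b, int size, T scale, T shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        T v = a[i];
        if (!isUnitScale)   /* folded away when isUnitScale == true */
            v = v * scale;
        if (!isZeroShift)   /* folded away when isZeroShift == true */
            v = v + shift;
        b[i] = v;
    }
}

int main()
{
    const int size = 4;
    int h[size] = { 1, 2, 3, 4 };
    int *da, *db;
    cudaMalloc(&da, size * sizeof(int));
    cudaMalloc(&db, size * sizeof(int));
    cudaMemcpy(da, h, size * sizeof(int), cudaMemcpyHostToDevice);

    /* the host-side if/else chain in _CudaScaleAndShift picks
       one of the four instantiations at run time */
    ScaleShift<int, false, false><<<1, 32>>>(da, db, size, 3, 1);

    cudaMemcpy(h, db, size * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("%d ", h[i]);   /* prints 4 7 10 13 */
    printf("\n");

    cudaFree(da); cudaFree(db);
    return 0;
}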