/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

#include "../../XDevice.h"
#include "../../XUtility.h"
#include "ReduceSum.cuh"

namespace nts{ // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
use PTX code to sum float data within a warp (shuffle-based warp reduction)
*/
__device__ __forceinline__  
float shflDownReduceSum(float input)
{
    float output;
    asm volatile(
        "{"
        ".reg .f32 r0;"
        "shfl.down.b32  r0, %1, 0x10, 0x1f;"
        "add.f32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x8, 0xf;"
        "add.f32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x4, 0x7;"
        "add.f32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x2, 0x3;"
        "add.f32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x1, 0x1;"
        "add.f32        %0, r0, %1;"
        "}"
        : "=f"(output) : "f"(input));
    return output;
}

/*
use PTX code to sum int data within a warp (shuffle-based warp reduction)
*/
__device__ __forceinline__  
int shflDownReduceSum(int input)
{
    int output;
    asm volatile(
        "{"
        ".reg .s32 r0;"
        "shfl.down.b32  r0, %1, 0x10, 0x1f;"
        "add.s32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x8, 0xf;"
        "add.s32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x4, 0x7;"
        "add.s32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x2, 0x3;"
        "add.s32        %1, r0, %1;"
        "shfl.down.b32  r0, %1, 0x1, 0x1;"
        "add.s32        %0, r0, %1;"
        "}"
        : "=r"(output) : "r"(input));
    return output;
}
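
/*
For reference only: a sketch of the same warp-level sum written with the CUDA
shuffle intrinsic instead of inline PTX. It is not used by the kernels below;
the function name is ours, __shfl_down_sync() requires CUDA 9.0 or later, and
the full mask 0xffffffff assumes all 32 lanes of the warp are active.
*/
__device__ __forceinline__
float shflDownReduceSumIntrinsic(float input)
{
    /* halve the shuffle distance each step: 16, 8, 4, 2, 1 */
    for (int offset = 16; offset > 0; offset >>= 1)
        input += __shfl_down_sync(0xffffffff, input, offset);
    return input;
}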


/* 
reduce a tensor to another that keeps the sum along a dimension  - slow version
Given a block of data, we go over each position i along the stride and compute
sum_i = sum_{0<=j<strideNum} exp((input_{i,j} - shift_i)^power) if isExp == true;
      = sum_{0<=j<strideNum} (input_{i,j} - shift_i)^power      if isExp == false;
where we view the block as a matrix and input_{i,j} is the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> shift - the bias imposed on the input
>> power - power of the item in the array
>> isExp - specify if we perform exp() on the input
*/
 __global__
void KernelReduceSum(DTYPE * input, DTYPE * output, 
                     int stride, int strideNum, int reducedStrideNum, 
                     int blockSize, int blockNum, 
                     DTYPE * shift, DTYPE power, bool isExp)
{
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int idx = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int i = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int j = blockIdx.x*blockDim.x + threadIdx.x;

    if(i >= stride * blockNum)
        return;

    if(threadIdx.x == 0)
        bias[threadIdx.y] = shift != NULL ? shift[i] : 0;

    __syncthreads();

    int k = i / stride;
    int iOffset = i % stride;
    bool isValid = (i < stride * blockNum && j < strideNum);

    DTYPE value =  isValid ? input[blockSize * k + stride * j + iOffset] - bias[threadIdx.y] : 0;

    if(power != (DTYPE)1.0){
        if(power == (DTYPE)2.0)
            value = value * value;
        else if(power == (DTYPE)0.5)
            value = sqrt(value);
        else
            value = pow(value, power);
    }

    if(isExp && isValid)
        value = exp(value);

    /* load data into the shared mem */
    iData[threadIdx.y * blockDim.x + threadIdx.x] = value;

    __syncthreads();

    /* do reduction in shared mem */
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        if (threadIdx.x < s)
            iData[idx] += iData[idx + s];

        __syncthreads();
    }
    /* write result for this block to the output array */
    if (threadIdx.x == 0 && blockIdx.x < reducedStrideNum) 
        output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = iData[threadIdx.y * blockDim.x];
}
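
/*
A host-side reference, given as an illustrative sketch only (the function name
is not part of the library and nothing below calls it). It computes what
KernelReduceSum produces when a single pass suffices (reducedStrideNum == 1),
using the same indexing: item (i, j) of block k sits at
input[blockSize * k + stride * j + iOffset] with i = k * stride + iOffset.
Host-side sqrt/pow/exp are assumed to be available via the included headers.
*/
void CpuReduceSumReference(const DTYPE * input, DTYPE * output,
                           int stride, int strideNum,
                           int blockSize, int blockNum,
                           const DTYPE * shift, DTYPE power, bool isExp)
{
    for (int k = 0; k < blockNum; k++) {
        for (int iOffset = 0; iOffset < stride; iOffset++) {
            DTYPE bias = (shift != NULL) ? shift[k * stride + iOffset] : 0;
            DTYPE sum = 0;
            for (int j = 0; j < strideNum; j++) {
                DTYPE value = input[blockSize * k + stride * j + iOffset] - bias;
                if (power != (DTYPE)1.0) {
                    if (power == (DTYPE)2.0)
                        value = value * value;
                    else if (power == (DTYPE)0.5)
                        value = (DTYPE)sqrt(value);
                    else
                        value = (DTYPE)pow(value, power);
                }
                if (isExp)
                    value = (DTYPE)exp(value);
                sum += value;
            }
            output[k * stride + iOffset] = sum;
        }
    }
}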

 /* 
reduce a tensor to another that keeps the sum along a dimension  - slow version
This is for float16 reduction.
Given a block of data, we go over each position i along the stride and compute
sum_i = sum_{0<=j<strideNum} exp((input_{i,j} - shift_i)^power) if isExp == true;
      = sum_{0<=j<strideNum} (input_{i,j} - shift_i)^power      if isExp == false;
where we view the block as a matrix and input_{i,j} is the item at the
crossing of the i-th column and the j-th row.
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> shift - the bias imposed on the input
>> power - power of the item in the array
>> isExp - specify if we perform exp() on the input
*/
 __global__
void KernelReduceSum(__half * input, __half * output, 
                     int stride, int strideNum, int reducedStrideNum, 
                     int blockSize, int blockNum, 
                     __half * shift, __half power, bool isExp)
{
    int idx = threadIdx.x * blockDim.y + threadIdx.y;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;

    if(i >= stride * blockNum)
        return;

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
    __shared__ __half bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    if(threadIdx.y == 0)
        bias[threadIdx.x] = shift != NULL ? shift[i] : __half(0);
#else
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    if(threadIdx.y == 0)
        bias[threadIdx.x] = shift != NULL ? __half(shift[i]) : __half(0);
#endif

    __syncthreads();

    int k = i / stride;
    int iOffset = i % stride;
    bool isValid = (i < stride * blockNum && j < strideNum);

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    __half value = isValid ? __hsub(input[blockSize * k + stride * j + iOffset], bias[threadIdx.x]) : __half(0);
    DTYPE power2 = __half2float(power);

    if(power2 != (DTYPE)1.0){
        if(power2 == (DTYPE)2.0)
            value = __hmul(value, value);
        else if(power2 == (DTYPE)0.5)
            value = hsqrt(value);
    }

    if(isExp && isValid)
        value = hexp(value);
#else
    DTYPE value =  isValid ? __half2float(input[blockSize * k + stride * j + iOffset]) - __half2float(bias[threadIdx.x]) : 0;
    DTYPE power2 = __half2float(power);

    if(power2 != (DTYPE)1.0){
        if(power2 == (DTYPE)2.0)
            value = value * value;
        else if(power2 == (DTYPE)0.5)
            value = sqrt(value);
        else
            value = pow(value, power2);
    }

    if(isExp && isValid)
        value = exp(value);
#endif

    /* load data into the shared mem */
    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;

    __syncthreads();

    /* do reduction in shared mem */
    for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
        if (threadIdx.y < s)
            iData[idx] += iData[idx + s];

        __syncthreads();
    }

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    /* write result for this block to the output array */
    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
#else
    /* write result for this block to the output array */
    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
#endif

}

/* 
reduce a tensor to another that keeps the sum along a dimension  - fast version
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> shift - the bias imposed on the input
>> power - power of the item in the array
>> isExp - specify if we perform exp() on the input
*/
template <unsigned int goodSize> __global__
void KernelReduceSumFast(DTYPE * input, DTYPE * output, 
                         int stride, int strideNum, int reducedStrideNum, 
                         int blockSize, int blockNum,
                         DTYPE * shift, DTYPE power, bool isExp)
{
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    unsigned int tid = threadIdx.x;
    unsigned int j = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int i = blockIdx.y * blockDim.y + threadIdx.y;

    if(i >= stride * blockNum)
        return;

    if (threadIdx.x == 0)
        bias[threadIdx.y] = shift != NULL ? shift[i] : 0;

    __syncthreads();

    /* first level reduction */
    int k = i / stride;
    int iOffset = i % stride;

    bool isValid = j < strideNum;
    bool isValid2 = j + blockDim.x < strideNum;

    DTYPE * data =  iData + threadIdx.y * blockDim.x;
    DTYPE * inputData = input  + k * blockSize;
    DTYPE value  = isValid ? inputData[j * stride + iOffset] - bias[threadIdx.y]: 0;
    DTYPE value2 = isValid2 ? inputData[(j + blockDim.x) * stride + iOffset] - bias[threadIdx.y]: 0;
    
    if(power != (DTYPE)1.0){
        if(power == (DTYPE)2.0){
            value = value * value;
            value2 = value2 * value2;
        }
        else if(power == (DTYPE)0.5){
            value = sqrt(value);
            value2 = sqrt(value2);
        }
        else{
            value = pow(value, power);
            value2 = pow(value2, power);
        }
    }

    if(isExp){
        if(isValid)
            value = exp(value);
        if(isValid2)
            value2 = exp(value2);
    }

    value = value + value2;

    __syncthreads();
    
    value = shflDownReduceSum(value);
    if ((tid & 0x1f) == 0) 
        data[tid / 32] = value;

    __syncthreads();
    
    if (tid < 32){
        if (tid < blockDim.x / 32)
            value = data[tid];
        else
            value = 0;
        value = shflDownReduceSum(value);

        if (tid == 0 && blockIdx.x < reducedStrideNum) {
            output[(k * reducedStrideNum + blockIdx.x) * stride + iOffset] = value;
        }
    }
}
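
/*
Worked example for the fast kernel above (illustrative sizes): with
blockDim.x = 64, each thread first folds two elements (value + value2), so one
thread block reduces 2 * blockDim.x = 128 items of a (k, iOffset) column;
shflDownReduceSum() then collapses each warp into one partial sum stored in
data[0 .. blockDim.x/32 - 1], and the first warp reduces those partials, with
thread 0 writing the block's result to the output array.
*/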

/* 
reduce a tensor to another that keeps the sum along a dimension  - fast version
This is for float16 reduction
>> input - the input array (representing a tensor)
>> output - the sum over each block. NOTE: output is also an array
>> stride - stride that we need to move to the next item
>> strideNum - how many strides we need to finish the reduce
>> reducedStrideNum - the number of strides after reduction
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> shift - the bias imposed on the input
>> power - power of the item in the array
>> isExp - specify if we perform exp() on the input
*/
template <unsigned int goodSize> __global__
void KernelReduceSumFast(__half * input, __half * output, 
                         int stride, int strideNum, int reducedStrideNum, 
                         int blockSize, int blockNum,
                         __half * shift, __half power, bool isExp)
{
    unsigned int tid = threadIdx.y;
    unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    
    if(i >= stride * blockNum)
        return;

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ __half bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    if(threadIdx.y == 0)
        bias[threadIdx.x] = shift != NULL ? shift[i] : __float2half(0);
#else
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    if(threadIdx.y == 0)
        bias[threadIdx.x] = shift != NULL ? __half2float(shift[i]) : 0;
#endif

    __syncthreads();

    /* first level reduction */
    int k = i / stride;
    int iOffset = i % stride;
    bool isValid = j < strideNum;
    bool isValid2 = j + blockDim.y < strideNum;

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    __half * data =  iData + threadIdx.x * blockDim.y;
    __half * inputData = input  + k * blockSize;
    __half value  = isValid ? __hsub(inputData[j * stride + iOffset], bias[threadIdx.x]) : __float2half(0);
    __half value2 = isValid2 ? __hsub(inputData[(j + blockDim.y) * stride + iOffset], bias[threadIdx.x]) : __float2half(0);

    DTYPE powerf = __half2float(power);

    if(powerf != (DTYPE)1.0){
        if(powerf == (DTYPE)2.0){
            value = __hmul(value, value);
            value2 = __hmul(value2, value2);
        }
        else if(powerf == (DTYPE)0.5){
            value = hsqrt(value);
            value2 = hsqrt(value2);
        }
    }

    if(isExp){
        if(isValid)
            value = hexp(value);
        if(isValid2)
            value2 = hexp(value2);
    }

#else
    DTYPE * data =  iData + threadIdx.x * blockDim.y;
    __half * inputData = input  + k * blockSize;
    DTYPE value  = isValid ? __half2float(inputData[j * stride + iOffset]) - __half2float(bias[threadIdx.x]): 0;
    DTYPE value2 = isValid2 ? __half2float(inputData[(j + blockDim.y) * stride + iOffset]) - __half2float(bias[threadIdx.x]): 0;

    DTYPE powerf = __half2float(power);

    if(powerf != (DTYPE)1.0){
        if(powerf == (DTYPE)2.0){
            value = value * value;
            value2 = value2 *value2;
        }
        else if(powerf == (DTYPE)0.5){
            value = sqrt(value);
            value2 = sqrt(value2);
        }
        else{
            value = pow(value, powerf);
            value2 = pow(value2, powerf);
        }
    }

    if(isExp){
        if(isValid)
            value = exp(value);
        if(isValid2)
            value2 = exp(value2);
    }
#endif

    /* load data into the shared mem */
    data[tid] = value + value2;

    __syncthreads();

    /* unroll the warp */
    if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
    if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
    if(goodSize >= 128) {if(tid <  64) {data[tid] += data[tid +  64];} __syncthreads();}
    if(goodSize >= 64)  {if(tid <  32) {data[tid] += data[tid +  32];} __syncthreads();}
    if(goodSize >= 32)  {if(tid <  16) {data[tid] += data[tid +  16];} __syncthreads();}
    if(goodSize >= 16)  {if(tid <   8) {data[tid] += data[tid +   8];} __syncthreads();}
    if(goodSize >=  8)  {if(tid <   4) {data[tid] += data[tid +   4];} __syncthreads();}
    if(goodSize >=  4)  {if(tid <   2) {data[tid] += data[tid +   2];} __syncthreads();}
    if(goodSize >=  2)  {if(tid <   1) {data[tid] += data[tid +   1];} __syncthreads();}

#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    /* write result for this block to the output array */
    if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
        output[(k * reducedStrideNum + blockIdx.y) * stride  + iOffset] = data[0];
#else
    /* write result for this block to the output array */
    if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
        output[(k * reducedStrideNum + blockIdx.y) * stride  + iOffset] = __float2half(data[0]);
#endif
}

/*
if the data storage is discontinuous, use this kernel to do the reduction
*/
__global__ 
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
                                         int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int blockIndex = idx / stride;
    int offsetInBlock = idx % stride; 
    if (idx >= stride * blockNum)
        return;
    bias[idx % blockDim.x] = shift != NULL ? shift[idx] : 0;
    DTYPE ans = 0;

#pragma unroll
    for (int i = stride * strideNum * blockIndex + offsetInBlock;
        i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
        i += stride){
        DTYPE value = input[i];
        value = value - bias[idx % blockDim.x];
        if (power != (DTYPE)1.0) {
            if (power == (DTYPE)2.0) {
                value = value * value;
            }
            else if (power == (DTYPE)0.5) {
                value = sqrt(value);
            }
            else {
                value = pow(value, power);
            }
        }
        if (isExp) {
            value = exp(value);
        }
        ans += value;
    }
    output[idx] = ans;
}

__global__
void KernelReduceSumOp(DTYPE * input, DTYPE * output,
                       int stride, int strideNum, int reducedStrideNum,
                       int blockSize, int blockNum,
                       DTYPE * shift, DTYPE power, bool isExp)
{
    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    unsigned int tid = threadIdx.y;
    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= stride * blockNum)
        return;

    if (threadIdx.y == 0)
        bias[threadIdx.x] = shift != NULL ? shift[i] : 0;

    __syncthreads();

    /* first level reduction */
    int k = i / stride;
    int iOffset = i % stride;

    DTYPE threadSum = 0;

    DTYPE * data = iData + threadIdx.x * blockDim.y;
    DTYPE * inputData = input + k * blockSize;
    for (int it = j; it < strideNum; it += blockDim.y){
        DTYPE value = inputData[it * stride + iOffset] - bias[threadIdx.x];
        if (power != (DTYPE)1.0) {
            if (power == (DTYPE)2.0) {
                value = value * value;
            }
            else if (power == (DTYPE)0.5) {
                value = sqrt(value);
            }
            else {
                value = pow(value, power);
            }
        }
        if (isExp) value = exp(value);
        threadSum += value;
    }
    __syncthreads();
    threadSum = shflDownReduceSum(threadSum);
    if ((tid & 0x1f) == 0) { data[tid / 32] = threadSum; }
    __syncthreads();
    if (tid < 32){
        if (tid < blockDim.y / 32)
            threadSum = data[tid];
        else 
            threadSum = 0;
        threadSum = shflDownReduceSum(threadSum);
        if (tid == 0 && blockIdx.y < reducedStrideNum)
            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
    }

}

__global__
void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
                                 int strideNum, int blockNum,
                                 DTYPE * shift, DTYPE power, bool isExp)
{
    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    int idx = threadIdx.x % 32;
    int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;

    if (idx == 0)
        bias[threadIdx.x / 32] = shift != NULL ? shift[idy] : 0;

    int startIndex = idy * strideNum;
    DTYPE threadSum = 0;
    for (int i = idx; i < strideNum; i += 32) {
        DTYPE value = input[startIndex + i] - bias[threadIdx.x / 32];
        if (power != (DTYPE)1.0) {
            if (power == (DTYPE)2.0) {
                value = value * value;
            }
            else if (power == (DTYPE)0.5) {
                value = sqrt(value);
            }
            else {
                value = pow(value, power);
            }
        }
        if (isExp) value = exp(value);
        threadSum += value;
    }
    threadSum = shflDownReduceSum(threadSum);
    if (idx == 0)
        output[idy] = threadSum;
}
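
/*
Worked example for KernelReduceSumOpLessBlocks (illustrative numbers): it is
launched as <<<ceil(blockNum / 4), 128>>> (see _CudaReduceSum below), so each
128-thread block holds 4 warps and each warp owns one row idy. With
strideNum = 100, lane idx reads elements idx, idx + 32, idx + 64 of that row
(and idx + 96 when idx < 4); shflDownReduceSum() leaves the row sum in lane 0,
which writes output[idy].
*/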

/*
allocate the number of warps per vector according to the number of SMs on the GPU
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
    int warpNum = 4;
    if (vectorNum < 20 * 8) {
        warpNum = 8;
        if (vectorNum < 20 * 4) {
            warpNum = 16;
            if (vectorNum < 20 * 2)
                warpNum = 32;
        }
    }
    int minWarpNum = vectorSize / 32;
    if (vectorSize % 32 != 0) minWarpNum++;
    warpNum = min(warpNum, minWarpNum);

    grid.x = (unsigned int)vectorNum;
    grid.y = 1;
    grid.z = 1;
    block.x = 1;
    block.y = warpNum * 32;
    block.z = 1;
}
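
/*
Worked example (illustrative sizes): for blockNum = 64 vectors of length
strideNum = 100, minWarpNum = ceil(100 / 32) = 4 caps the allocation, so
block = (1, 4 * 32) = (1, 128) and grid = (64, 1, 1), i.e. one thread block
of four warps per vector.
*/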

/* 
in this case we use block.x * grid.x threads to deal with one vector, so that reads are continuous
*/
void discontinuousStorageNoShareMemThreadAllocation(dim3* grid, dim3* block, int stride, int blockNum)
{
    block->x = 512;
    block->y = 1;
    if ((stride * blockNum) % 512 == 0)
        grid->x = (stride * blockNum) / 512;
    else
        grid->x = (stride * blockNum) / 512 + 1;
    grid->y = 1;
}
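
/*
Worked example (illustrative sizes): for stride = 32 and blockNum = 40 there
are stride * blockNum = 1280 output elements; since 1280 % 512 != 0,
block = (512, 1) and grid = (1280 / 512 + 1, 1) = (3, 1), one thread per
output element.
*/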

/*
adjust the number of threads.x so that the warp-level optimization can be used
*/
void adjustThreadForUseWarpOptimization(dim3* blocks, dim3* threads)
{
    if (threads->y > 1){
        blocks->y *= threads->y;
        threads->y = 1;
    }
    if (threads->x < 32)
        threads->x = 32;
}
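
/*
Worked example (illustrative sizes): threads = (16, 8) with blocks = (4, 2)
becomes threads = (32, 1) with blocks = (4, 16): the y-threads are folded into
the y-dimension of the grid and threads.x is padded to a full warp so that
shflDownReduceSum() runs over 32 lanes.
*/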

/* 
sum the items along a dimension of the tensor (cuda version). 
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> input - the input tensor
>> output - the output tensor
>> dim - which dimension to reduce
>> shift - the bias on the input
>> power - we perform pow(item_i, power) on each item
>> isExp - specify if the exp() is performed
*/
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
    CheckNTErrors(input && output, "Empty input or output tensors!");
    CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
    CheckNTErrors(input->order > dim && dim >= 0, "Illegal dimension to reduce!");
    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
    CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");

    int dimRDI = input->order - dim - 1;
    for(int i = 0; i < input->order; i++){
        if(i < dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
        }
        else if(i > dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
        }
    }

    if(input->dataType == X_FLOAT16)
        CheckNTErrors(power == 0 || power == 0.5 || power == 1.0 || power == 2.0, "TODO!");

    int cudaGridSize[3];
    int cudaBlockSize[3];
    int iter = 0;
    int stride = 1;
    int strideNum = input->dimSizeRDI[dimRDI];
    int blockSize = 1;
    int blockNum = 1;

    for (int i = 0; i < input->order; i++) {
        if (i < dimRDI)
            stride *= input->dimSizeRDI[i];
        else if (i > dimRDI)
            blockNum *= input->dimSizeRDI[i];
    }
    blockSize = stride * strideNum;

    int devID = input->devID;
    XMem * mem = input->mem;

    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);

    int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
    DTYPE * buf  = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
    DTYPE * buf1 = buf;
    DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
    DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
    
    int devIDBackup;
    ProtectCudaDev(input->devID, devIDBackup);

    if (stride == 1 && blockNum >= 10) {
        dim3 grids;
        dim3 blocks;
        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
        if (blocks.y >= 128)
            KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, 
                                                    strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
        else {
            if (blockNum % 4 != 0) 
                blockNum = (int)(blockNum / 4) + 1;
            else 
                blockNum = blockNum / 4;
            KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data, 
                                                              strideNum, blockNum, sp, power, isExp);
        }
    }
    else if (stride != 1 && stride * blockNum > 4096){
        //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
        //unsigned int* goutput = (unsigned int *)input->data;
        //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
        dim3 grid, block;
        discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
        KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, 
                                                                strideNum, blockNum,sp, power, isExp);
    }
    else {
        do {
            if (input->dataType == DEFAULT_DTYPE) {
                DTYPE * iData = NULL;
                DTYPE * oData = NULL;
                if (iter == 0) {
                    iData = (DTYPE*)input->data;
                    oData = buf1;
                }
                else if (iter % 2 == 1) {
                    iData = buf1;
                    oData = buf2;
                }
                else {
                    iData = buf2;
                    oData = buf1;
                }
                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum <= 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                           blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(&blocks, &threads);
                    KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                   blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(&blocks, &threads);
                    KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                    blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(&blocks, &threads);
                    KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                    blockSize, blockNum, sp, power, isExp);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(&blocks, &threads);
                    KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x, 
                                                                    blockSize, blockNum, sp, power, isExp);
                }
            }
            else if (input->dataType == X_FLOAT16) {
                __half * buf1ft16 = (__half *)buf1;
                __half * buf2ft16 = (__half *)buf2;
                __half * spft16 = (__half *)sp;
                unsigned short power2 = FloatToFloat16(power);
                __half * powerft16p = (__half*)&power2;
                __half * iData = NULL;
                __half * oData = NULL;
                if (iter == 0) {
                    iData = (__half*)input->data;
                    oData = buf1ft16;
                }
                else if (iter % 2 == 1) {
                    iData = buf1ft16;
                    oData = buf2ft16;
                }
                else {
                    iData = buf2ft16;
                    oData = buf1ft16;
                }

                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum < 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
                                                           blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
                                                                   blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
                                                                    blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
                                                                    blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, 
                                                                    blockSize, blockNum, spft16, *powerft16p, isExp);
                }
            }

            strideNum = cudaGridSize[0];
            blockSize = cudaGridSize[0];
            sp = NULL;
            power = (DTYPE)1.0;
            isExp = false;

            iter++;

        } while (strideNum > 1);
    }
    ProtectCudaDev(input->devID, devIDBackup);

    if (mem != NULL)
        mem->ReleaseBuf(mem->devID, bufSize);
    else
        XMemFree(input->devID, buf);
}

#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)