/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
copy a number of blocks to target positions
NOTE that this version makes more use of the 2d threads in cuda
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
*/
template<class T>
__global__
void KernelCopyBlocks(T * source, int blockSize, int blockNum, T * target, int * targetBlocks)
{
    /* entry index within a block (x dimension of the 2D launch) */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block index (y dimension of the 2D launch) */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    /* guard: the grid rarely divides the data evenly */
    if (i >= blockSize || j >= blockNum)
        return;

    /* head of the j-th source block and of its destination block */
    T * s = source + blockSize * j;
    T * t = target + blockSize * targetBlocks[j];

    t[i] = s[i];
}

/*
copy a number of blocks to target positions
NOTE that this version makes more use of the 2d threads in cuda
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
*/
template<class T>
__global__
void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize, T * target, int * targetBlocks)
{
    /* flat entry index over all blocks (1D launch); adjacent threads
       touch adjacent elements, so global accesses coalesce */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* guard: the grid rarely divides totalSize evenly */
    if (i >= totalSize)
        return;

    /* which block this entry belongs to, and its offset inside the block */
    int targetBlockID = targetBlocks[i / blockSize];
    int targetOffset  = i % blockSize;

    *(target + blockSize * targetBlockID + targetOffset) = source[i];
}

/*
copy a number of blocks to target positions (cuda version)
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block (in bytes)
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy (on the device)
>> devID - device id
*/
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{
    CheckNTErrors(devID >= 0, "Wrong device to run!");

    int cudaGrids[3];
    int cudaBlocks[3];

    /* switch to the target device; the previous device id is restored below */
    int devIDBackup;
    ProtectCudaDev(devID, devIDBackup);

    /* reinterpret the byte-sized blocks with the widest element type that
       evenly divides them, so each thread moves 8 (or 4) bytes at a time.
       KernelCopyBlocks is a 2D-thread alternative to KernelCopyBlocksV2. */
    if(blockSize % sizeof(double) == 0){
        int bSize = blockSize / sizeof(double);
        GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
        KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
                                    ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
    }
    else 
    if(blockSize % sizeof(float) == 0){
        int bSize = blockSize / sizeof(float);
        GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
        KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
                                   ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
    }
    else{
        /* blockSize is not a multiple of 4 bytes - no element type fits */
        ShowNTErrors("Unsupported block size!");
    }

    BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)