CopyBlocks.cpp 3.51 KB
Newer Older
xiaotong committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

22 23
#include "../../XTensor.h"
#include "../../XUtility.h"
xiaotong committed
24 25 26 27 28 29 30 31 32 33 34 35 36 37
#include "CopyBlocks.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksSelected.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
copy a number of blocks to target positions
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
38
>> devID - device id
xiaotong committed
39
*/
40
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
xiaotong committed
41
{
42 43 44 45
    if (myMem != NULL)
        devID = myMem->devID;
    
    if (devID >= 0) {
xiaotong committed
46 47
#ifdef USE_CUDA
        /* copy the index from host to device */
48
        int * targetBlocksTMP = myMem != NULL ?
xiaotong committed
49
                               (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
50
                               (int*)XMemAlloc(devID, blockNum * sizeof(int));
xiaotong committed
51
        XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
xiaotong committed
52

53
        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, devID);
xiaotong committed
54

55 56 57 58
        if(myMem != NULL)
            myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
        else
            XMemFree(devID, targetBlocksTMP);
xiaotong committed
59 60 61 62 63
#else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
    }
    else {
64
        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, devID);
xiaotong committed
65 66 67 68 69 70 71 72 73 74 75 76
    }
}

/*
copy a number of blocks source source positions to target positions
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block
>> srcBlocks - source positions of the copy
>> blockNum - number of blocks (lenth of srcBlocks and tgtBlocks)
>> target - target data array
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
77
>> devID - device id
xiaotong committed
78
*/
79
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
xiaotong committed
80
{
81
    if (myMem != NULL)
82
        devID = myMem->devID;
83 84

    if (devID >= 0) {
xiaotong committed
85
#ifdef USE_CUDA
86
        _CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
xiaotong committed
87 88 89 90 91
#else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
    }
    else {
liyinqiao committed
92 93
        /* 
        The following code should be fine with GPUs, but too many
xiaotong committed
94
        kernel calls would slow down the system. We prefer to use
liyinqiao committed
95 96
        one kernel to do block copy in batch (kernel fusion). 
        */
xiaotong committed
97 98
        for (int i = 0; i < blockNum; i++) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
99
                     (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
xiaotong committed
100 101 102 103 104
        }
    }
}

} // namespace nts(NiuTrans.Tensor)