/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
 */

#include "Spread.h"
#include "Spread.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
This is the core assignment for the spread function.

>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
*/
void _Assignment(DTYPE * sData, DTYPE * cData, int blockNum, 
                 int blockSizeSrc, int blockSizeColl, int stride) 
{
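    /* note: only the first "stride" elements of each block are copied;
       the caller passes sData and cData already offset to the selected
       sub-tensor, so exactly one slice per block is touched */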
    for (int i = 0; i < blockNum; i++) {
        DTYPE * s = sData + blockSizeSrc * i;
        DTYPE * c = cData + blockSizeColl * i;
        for(int j = 0; j < stride; j++)
            s[j] = c[j];
    }
}

/*
spread a collection tensor to the source tensor.
This is the inverse operation of gather.

>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
         e.g., for a tensor of size (3, 2, 4) and dim = 2, 
         we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> collIndex - index of the gathered sub-tensors
*/
void _Spread(XTensor * source, XTensor * collection, int dim, 
             int * srcIndex, int indexSize, int * collIndex)
{
    int order = source->order;

    CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
    
    for(int i = 0; i < order; i++){
        if(i < dim){
            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
        }
        else if(i > dim){
            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
        }
        else{
            CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
        }
    }

#ifdef USE_CUDA
    if(source->devID >= 0 && collection->devID >= 0) {
        _CudaSpread(source, collection, dim, srcIndex, indexSize, collIndex);
        return;
    }
#endif

    int blockSizeSrc = 1;
    int blockSizeColl = 1;
    int blockNum = 1;
    int stride = 1;

    for (int i = dim + 1; i < order; i++) {
        stride *= source->GetDim(i);
    }
    
    blockSizeSrc = stride * source->GetDim(dim);
    blockSizeColl = stride * collection->GetDim(dim);
    blockNum = source->unitNum / blockSizeSrc;

    DTYPE * sData = (DTYPE*)source->data;
    DTYPE * cData = (DTYPE*)collection->data;

    for(int i = 0; i < indexSize; i++){
        int src = srcIndex[i];
        int tgt = collIndex[i];
        DTYPE * s = sData + src * stride;
        DTYPE * c = cData + tgt * stride;
        _Assignment(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
    }
}
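
/*
A small worked example (an illustrative sketch only; the tensors are assumed
to be allocated and initialized by the caller and are not part of this file):
let "source" be a (4, 3) CPU tensor and "collection" a (2, 3) CPU tensor. Then

    int srcIndex[2]  = {0, 2};
    int collIndex[2] = {0, 1};
    _Spread(&source, &collection, 0, srcIndex, 2, collIndex);

copies row 0 of "collection" into row 0 of "source" and row 1 of "collection"
into row 2 of "source". With dim = 0 each sub-tensor is a row of stride = 3
elements, so the call undoes a gather of rows 0 and 2.
*/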

/*
This is the core assignment for the backward computation of the gather function.
Note that the operator is "+=" rather than "=".

>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
*/
void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum, 
                          int blockSizeSrc, int blockSizeColl, int stride) 
{
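    /* "+=" rather than "=": when the same source position is gathered more
       than once, its contributions must accumulate in the backward pass */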
    for (int i = 0; i < blockNum; i++) {
        DTYPE * s = sData + blockSizeSrc * i;
        DTYPE * c = cData + blockSizeColl * i;
        for(int j = 0; j < stride; j++)
            s[j] += c[j];
    }
}

/*
spread a collection tensor to the source tensor.
This is a special spread function for the backward computation of the CopyIndexed function.

>> s - the source tensor whose data would be modified
>> c - the collection whose data would be spread to the source tensor
>> dim - the leading dimension to define "sub-tensors"
         e.g., for a tensor of size (3, 2, 4) and dim = 2, 
         we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> collIndex - the tensor to save the index of the collection sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, 
             e.g., for srcIndex = [1,4] and copyNum = 2,
             we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim, 
                           XTensor * srcIndex, XTensor * collIndex, 
                           int copyNum)
{
    int order = s->order;
    int indexSize = srcIndex->unitNum;

    CheckNTErrors(indexSize != 0, "NULL index!");
    CheckNTErrors((s && c), "Invalid tensors!");
    CheckNTErrors((srcIndex && collIndex), "Invalid index tensors!");
    CheckNTErrors((s->devID == c->devID || (s->devID < 0 && c->devID < 0)),
                  "the data must be kept on the same device!");
    CheckNTErrors((srcIndex->devID == collIndex->devID || (s->devID < 0 && c->devID < 0)),
                  "the index must be kept on the same device!");
    CheckNTErrors((s->devID == srcIndex->devID || (s->devID < 0 && c->devID < 0)),
                  "the data and index must be kept on the same device!");
    CheckNTErrors((dim >= 0 && dim < s->order), "Illegal dimension specified!");
    CheckNTErrors((s->unitSize == c->unitSize), "Unmatched tensors!");
    CheckNTErrors((srcIndex->unitNum == collIndex->unitNum), "Unmatched index tensors!");

    CheckNTErrors(s->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
    
    for (int i = 0; i < order; i++) {
        if (i != dim) {
            CheckNTErrors(s->GetDim(i) == c->GetDim(i), "Unmatched dimensions");
        }
        else {
            CheckNTErrors(c->GetDim(i) == indexSize * copyNum, "Unmatched dimensions");
        }
    }

#ifdef USE_CUDA
    if(s->devID >= 0 && c->devID >= 0) {
        _CudaSpreadForCopyIndexed(s, c, dim, srcIndex, collIndex, copyNum);
        return;
    }
#endif

    int blockNum = 1;
    int stride = 1;
    int blockSizeSrc = 1;
    int blockSizeTgt = 1;

    for (int i = 0; i < dim; i++)
        blockNum *= s->GetDim(i);

    for (int i = dim + 1; i < order; i++)
        stride *= s->GetDim(i);

    blockSizeSrc = stride * s->GetDim(dim);
    blockSizeTgt = stride * c->GetDim(dim);

    DTYPE * sData = (DTYPE*)s->data;
    DTYPE * cData = (DTYPE*)c->data;
    int * sIndex = (int*)srcIndex->data;
    int * cIndex = (int*)collIndex->data;

    for (int i = 0; i < indexSize; i++) {
        for (int n = 0; n < copyNum; n++) {
            int si = sIndex[i] + n;
            int ti = cIndex[i] + n;

            for (int j = 0; j < blockNum; j++) {
                DTYPE * sd = sData + j * blockSizeSrc + si * stride;
                DTYPE * td = cData + j * blockSizeTgt + ti * stride;
                for (int k = 0; k < stride; k++)
                    *(sd + k) += *(td + k);
            }
        }
    }
}
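
/*
An illustrative sketch (hypothetical shapes, assumed to be prepared by the
caller): let "s" be a (2, 6) CPU tensor, "c" a (2, 4) CPU tensor, and let the
index tensors hold srcIndex = {1, 4} and collIndex = {0, 2}. Then

    _SpreadForCopyIndexed(&s, &c, 1, &srcIndex, &collIndex, 2);

adds columns 0..3 of "c" onto columns 1, 2, 4 and 5 of "s": with copyNum = 2,
each index addresses two consecutive sub-tensors along dim = 1.
*/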

/*
spread a collection tensor to the source tensor.
This is a special spread function for the backward computation of the gather function.

>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to the source tensor
>> index - the tensor to save the index of the collection tensor
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
    int dim = 0;
    int order = source->order;

    CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
    
    for(int i = 0; i < order; i++){
        if(i == dim){
            CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
        }
        else {
            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
        }
    }

#ifdef USE_CUDA
    if(source->devID >= 0 && collection->devID >= 0) {
        _CudaSpreadForGather(source, collection, index);
        return;
    }
#endif
    
    int stride = 1;
    int indexSize = 1;

    stride = source->GetDim(-1);
    indexSize = index->unitNum;

    DTYPE * sData = (DTYPE*)source->data;
    DTYPE * cData = (DTYPE*)collection->data;
    int * sIndexData = (int*)index->data;

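    /* each index selects a contiguous block of "stride" elements in the
       source (one row when the source is a matrix); the matching block of
       the collection is added onto it */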
    for (int i = 0; i < indexSize; i++) {
        int sIndex = sIndexData[i] * stride;
        for (int j = 0; j < stride; j++)
            sData[sIndex + j] += cData[i * stride + j];
    }
}
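
/*
A brief usage sketch (hypothetical shapes, for illustration only): if "source"
is a (4, 3) tensor, "collection" is a (3, 3) tensor of gathered gradients and
"index" is an index tensor holding {0, 2, 0}, then

    _SpreadForGather(&source, &collection, &index);

adds row 0 of "collection" to row 0 of "source", row 1 to row 2, and row 2
again to row 0. Repeated indices accumulate, which is what the backward pass
of gather requires.
*/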

} // namespace nts(NiuTrans.Tensor)