/* 
 * NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
 * I'm surprised that I did not write this file till today.
 */

#include <curand.h>
#include <time.h>
#include "SetData.cuh"
#include <curand_kernel.h>
#include "../../XDevice.h"
#include "../../XUtility.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
set a data array with a fixed value

>> d - pointer to the data array
>> v - the initial value
>> size - size of the array
*/
template<class T>
__global__
void KernelSetDataFixed(T * d, T v, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size)
        d[i] = v;
}
template __global__ void KernelSetDataFixed<int>(int *, int, int);
template __global__ void KernelSetDataFixed<float>(float *, float, int);
template __global__ void KernelSetDataFixed<double>(double *, double, int);

/* 
generate data items with a fixed value 

>> tensor - the tensor for initialization
>> value - the initial value
*/
template<class T>
void _CudaSetDataFixed(XTensor * tensor, T value)
{
    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;

    ProtectCudaDev(tensor->devID, devIDBackup);

    if (tensor->dataType == X_INT)
        KernelSetDataFixed<<<blocks, threads>>>((int*)tensor->data, (int)value, tensor->unitNum);
    else if (tensor->dataType == X_FLOAT)
        KernelSetDataFixed<<<blocks, threads>>>((float*)tensor->data, (float)value, tensor->unitNum);
    else if (tensor->dataType == X_DOUBLE)
        KernelSetDataFixed<<<blocks, threads>>>((double*)tensor->data, (double)value, tensor->unitNum);
    else
        ShowNTErrors("TODO! Unsupported datatype!")

    BacktoCudaDev(tensor->devID, devIDBackup);
}
template void _CudaSetDataFixed<int>(XTensor *, int);
template void _CudaSetDataFixed<float>(XTensor *, float);
template void _CudaSetDataFixed<double>(XTensor *, double);
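
/* 
Usage sketch (illustrative only; it assumes the InitTensor2D helper that is
declared elsewhere in the library, whose exact signature may differ, and a
tensor placed on GPU 0):

    XTensor a;
    InitTensor2D(&a, 2, 3, X_FLOAT, 0);   // a 2 x 3 float tensor on device 0
    _CudaSetDataFixed(&a, 1.0F);          // every entry of a becomes 1.0
*/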

/* 
set a data array with a fixed value only 
if the corresponding condition entry is non-zero 

>> d - pointer to the data array
>> c - pointer to the condition array
>> value - the initial value
>> size - size of the array
*/
template<class T>
__global__ 
void KernelSetDataFixedCond(T * d, T * c, T value, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size && c[i] != 0)
        d[i] = value;
}
template __global__ void KernelSetDataFixedCond<int>(int*, int*, int, int);
template __global__ void KernelSetDataFixedCond<float>(float*, float*, float, int);
template __global__ void KernelSetDataFixedCond<double>(double*, double*, double, int);
/* 
generate data items with a fixed value 
only if the corresponding condition entry is non-zero 

>> tensor - the tensor for initialization
>> condition - the condition tensor whose entries are checked to decide
               whether to set the corresponding entries in "tensor"
>> value - the initial value
*/
template<class T>
void _CudaSetDataFixedCond(XTensor* tensor, XTensor* condition, T value)
{
    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    if (tensor->dataType == X_INT)
        KernelSetDataFixedCond<<<blocks, threads>>>((int*)tensor->data, (int*)condition->data,
                                                    (int)value, tensor->unitNum);
    else if (tensor->dataType == X_FLOAT)
        KernelSetDataFixedCond<<<blocks, threads>>>((float*)tensor->data, (float*)condition->data,
                                                    (float)value, tensor->unitNum);
    else if (tensor->dataType == X_DOUBLE)
        KernelSetDataFixedCond<<<blocks, threads>>>((double*)tensor->data, (double*)condition->data,
                                                    (double)value, tensor->unitNum);
    else
        ShowNTErrors("TODO! Unsupported datatype!")

    BacktoCudaDev(tensor->devID, devIDBackup);
}
template void _CudaSetDataFixedCond<int>(XTensor*, XTensor*, int);
template void _CudaSetDataFixedCond<float>(XTensor*, XTensor*, float);
template void _CudaSetDataFixedCond<double>(XTensor*, XTensor*, double);
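
/* 
Usage sketch (illustrative only; "a" and "mask" are assumed to be float
tensors of the same shape on the same GPU):

    _CudaSetDataFixedCond(&a, &mask, 0.5F);
    // a[i] = 0.5 wherever mask[i] != 0; all other entries are left unchanged
*/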

/* 
rescale data items generated by curand so that they follow 
a uniform distribution in [lower, lower + variance]

>> d - (float) pointer to the data array 
>> size - size of the array
>> lower - lower bound of the range
>> variance - width of the range (i.e., upper - lower)
*/
__global__
void KernelSetDataRandFloat(float * d, int size, DTYPE lower, DTYPE variance)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    
    if (i < size) {
        d[i] = d[i] * variance + lower;
    }
}
/* 
rescale data items generated by curand so that they follow 
a uniform distribution in [lower, lower + variance]

>> d - (double) pointer to the data array
>> size - size of the array
>> lower - lower bound of the range
>> variance - width of the range (i.e., upper - lower)
*/
__global__
void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    
    if (i < size){
        d[i] = d[i] * variance + lower;
    }
}

/*
set a data item to a pre-defined value if its current value >= p, 
and set it to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value to assign when an item is kept
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size) {
        if (d[i] >= p)
            d[i] = value;
        else
            d[i] = 0;
    }
}

/* 
set data items along a given dimension (and keep the remaining items unchanged) - kernel version
>> d - pointer to the data array
>> beg - the beginning position
>> len - length of the segment to be set
>> blockSize - size of a data block
>> blockNum - number of data blocks
>> p - the value to set
*/
template<class T>
__global__
void KernelSetDataDim(T * d, int beg, int len, int blockSize, int blockNum, T p)
{
    /* offset in each block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block id */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    if(i >= blockSize || j >= blockNum)
        return;

    if(i < beg || i >= beg + len)
        return;

    d[blockSize * j + i] = p;
}
template __global__ void KernelSetDataDim<int>(int*, int, int, int, int, int);
template __global__ void KernelSetDataDim<float>(float*, int, int, int, int, float);
template __global__ void KernelSetDataDim<double>(double*, int, int, int, int, double);

/* 
set data items along a given dimension (and keep the remaining items unchanged) - cuda version
>> tensor - the tensor whose data array would be initialized
>> beg - the beginning position
>> len - length along the given dimension
>> dim - the dimension along which we set the data
>> p - the value to set
e.g., given a 3 * 3 tensor 
      1 2 3
      4 5 6
      7 8 9
      when beg = 1, len = 1, dim = 0 and p = 0, we have
      1 2 3
      0 0 0
      7 8 9
      i.e., we set all entries of row 1 to 0
*/
template<class T>
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p)
{
    int n = tensor->order;

    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
    CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
    CheckNTErrors(len >= 0 && beg + len <= tensor->GetDim(dim), "Illegal length!");

    int stride = 1;
    int blockSize = 1;
    int blockNum  = 1;
    for(int i = n - 1; i > dim; i--){
        stride *= tensor->GetDim(i);
    }
    blockSize = stride * tensor->GetDim(dim);
    blockNum = tensor->unitNum / blockSize;

    int cudaGrids[3];
    int cudaBlocks[3];

    GDevs.GetCudaThread2D(tensor->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);

    dim3 blocks(cudaGrids[0], cudaGrids[1]);
    dim3 threads(cudaBlocks[0], cudaBlocks[1]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    if (tensor->dataType == X_INT)
        KernelSetDataDim<<<blocks, threads>>>((int*)tensor->data, beg * stride,
                                              len * stride, blockSize, blockNum, (int)p);
    else if (tensor->dataType == X_FLOAT)
        KernelSetDataDim<<<blocks, threads>>>((float*)tensor->data, beg * stride,
                                              len * stride, blockSize, blockNum, (float)p);
    else if (tensor->dataType == X_DOUBLE)
        KernelSetDataDim<<<blocks, threads>>>((double*)tensor->data, beg * stride,
                                              len * stride, blockSize, blockNum, (double)p);
    else
        ShowNTErrors("TODO! Unsupported datatype!")

    BacktoCudaDev(tensor->devID, devIDBackup);
}
template void _CudaSetDataDim<int>(XTensor*, int, int, int, int);
template void _CudaSetDataDim<float>(XTensor*, int, int, int, float);
template void _CudaSetDataDim<double>(XTensor*, int, int, int, double);
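
/* 
Usage sketch, following the example in the comment above (illustrative only;
"a" is assumed to be a 3 x 3 float tensor on the GPU):

    _CudaSetDataDim(&a, 1, 1, 0, 0.0F);   // zero out row 1, keep the rest
*/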

/* 
modify data items along a given index and dimension 
(and keep the remaining items unchanged) - kernel version

>> s - pointer to the data array to be modified
>> m - pointer to the data used to modify the data pointed to by s
>> blockNum - number of data blocks
>> blockSize - size of a data block
>> stride - stride of a data block
*/
__global__
void KernelSetDataIndexed(DTYPE * s, DTYPE * m, int blockNum, int blockSize, int stride)
{
    /* offset in each block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block id */
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    
    if(i >= stride || j >= blockNum)
        return;

    int x = blockSize * j + i;
    int y = stride * j + i;
    s[x] = m[y];
}

/*
modify data items along a given index and dimension (and keep the remaining items unchanged) 
>> source - the tensor whose data array would be modified
>> modify - the tensor whose data array would be used to modify the source tensor
>> dim - the dimension along which we modify the tensor
>> index - index of the given dimension
e.g., given a source tensor of shape (3, 3)
      1 2 3
      4 5 6
      7 8 9
      and a modifying tensor of shape (3)
      1 2 3
      when dim = 0 and index = 1, we have
      1 2 3
      1 2 3
      7 8 9
      i.e., we set the entries of row 1 to {1, 2, 3}
*/
void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
{
    int order = source->order;
    int size = source->GetDim(dim);

    CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
    CheckNTErrors(index >= 0 && index < size, "Illegal index!");
    
    int stride = 1;
    int blockSize = 1;
    int blockNum  = 1;

    for(int i = order - 1; i > dim; i--){
        stride *= source->GetDim(i);
    }

    blockSize = stride * source->GetDim(dim);
    blockNum = source->unitNum / blockSize;

    int cudaGrids[3];
    int cudaBlocks[3];

    GDevs.GetCudaThread2D(source->devID, stride, blockNum, MAX_INT, cudaGrids, cudaBlocks);

    dim3 blocks(cudaGrids[0], cudaGrids[1]);
    dim3 threads(cudaBlocks[0], cudaBlocks[1]);

    int devIDBackup;
    ProtectCudaDev(source->devID, devIDBackup);
    
    KernelSetDataIndexed<<<blocks, threads >>>((DTYPE*)source->data + index * stride, (DTYPE*)modify->data, 
                                                blockNum, blockSize, stride);

    BacktoCudaDev(source->devID, devIDBackup);
}
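
/* 
Usage sketch, following the example in the comment above (illustrative only;
"src" is a 3 x 3 float tensor and "row" a float tensor of shape (3), both on
the same GPU):

    _CudaSetDataIndexed(&src, &row, 0, 1);   // copy "row" into row 1 of "src"
*/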

/* 
set lower triangular matrices for each block

>> d - pointer to the data array
>> l - row number (or column number) of each block, i.e., 
       a block is an l * l matrix
>> blockSize - size of each block (blockSize = l * l)
>> blockNum - number of the blocks
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal
   e.g., for a 3 * 3 tensor, 
         when p = 1 and shift = 0, we have
         1 0 0
         1 1 0
         1 1 1
         when p = 2 and shift = -1, we have
         0 0 0
         2 0 0
         2 2 0
*/
__global__
void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{
    /* offset in each block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block id */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    if(i >= blockSize || j >= blockNum)
        return;

    int row = i / l;
    int col = i % l;
    DTYPE * d2 = d + blockSize * j + row * l + col;

    if(col <= row + shift)
        *d2 = p;
    else
        *d2 = 0;
}

/*
generate data as lower triangular matrices for the last two dimensions (cuda version)

>> tensor - the tensor whose data is to be set
>> value - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal

   e.g., for a 3 * 3 tensor,
         when value = 1 and shift = 0, we have
         1 0 0
         1 1 0
         1 1 1
         when value = 2 and shift = -1, we have
         0 0 0
         2 0 0
         2 2 0
*/
void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift)
{
    int size = tensor->GetDim(-1);
    int blockSize = size * size;
    int blockNum = tensor->unitNum / blockSize;

    int cudaGrids[3];
    int cudaBlocks[3];

    GDevs.GetCudaThread2D(tensor->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);

    dim3 blocks(cudaGrids[0], cudaGrids[1]);
    dim3 threads(cudaBlocks[0], cudaBlocks[1]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, size, blockSize, blockNum, value, shift);

    BacktoCudaDev(tensor->devID, devIDBackup);
}
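
/* 
Usage sketch (illustrative only; "a" is a float tensor whose last two
dimensions are equal, e.g., a 3 x 3 tensor on the GPU). A typical use of
such matrices is masking in sequence models:

    _CudaSetDataLowTri(&a, 1.0F, 0);   // 1 on and below the diagonal, 0 above
*/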

/*
generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
*/
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{
    CheckNTErrors(upper > lower, "the upper value must be greater than the lower value!");

    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);
    
    curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
    /* generate uniform samples in (0, 1]; use the double variant of the
       curand call for double tensors */
    if (tensor->dataType == X_FLOAT)
        curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
    else if (tensor->dataType == X_DOUBLE)
        curandGenerateUniformDouble(gen, (double*)tensor->data, tensor->unitNum);
    else
        ShowNTErrors("TODO! Unsupported datatype!")
    
    DTYPE variance = upper - lower;

    if(variance != 1.0F || lower != 0){
        if (tensor->dataType == X_FLOAT)
            KernelSetDataRandFloat  <<<blocks, threads >>>
                                     ((float*) tensor->data, tensor->unitNum, lower, variance);
        else if (tensor->dataType == X_DOUBLE)
            KernelSetDataRandDouble <<<blocks, threads >>>
                                     ((double*)tensor->data, tensor->unitNum, lower, variance);
    }

    BacktoCudaDev(tensor->devID, devIDBackup);
}
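
/* 
Usage sketch (illustrative only; "a" is a float tensor on the GPU). The call
fills the tensor via curand and then rescales the samples into the requested
range with the kernels above:

    _CudaSetDataRand(&a, -0.1F, 0.1F);   // uniform samples in [-0.1, 0.1]
*/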

/* 
generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise 
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
    _CudaSetDataRand(tensor, lower, upper);

    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);
    
    KernelSetDataPCut<<<blocks, threads>>>((float*)tensor->data, tensor->unitNum, p, value);

    BacktoCudaDev(tensor->devID, devIDBackup);
}
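
/* 
Usage sketch (illustrative only; "mask" is a float tensor on the GPU). One
way to build an inverted dropout mask: an entry survives with probability
1 - p and the survivors are scaled by 1 / (1 - p):

    DTYPE p = 0.3F;
    _CudaSetDataRandP(&mask, 0.0F, 1.0F, p, 1.0F / (1.0F - p));
*/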

/*
set the data with an array of offsets (kernel version)
>> data - pointer to the data array
>> offsets - offset for each data item
>> value - value of the data items
>> num - number of the data items
*/
__global__
void KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)
{
    /* index */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < num)
        data[offsets[i]] = value;
}

/*
set the data with an array of offsets (cuda version)
>> tensor - the tensor that keeps the data
>> offsets - offset for each data item
>> value - value of the data items
>> num - number of the data items
*/
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num)
{
    CheckNTErrors(tensor->dataType == X_FLOAT, "Data type is incorrect!");

    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, (int)num, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    KernelSetDataWithOffset<<<blocks, threads>>>((DTYPE*)tensor->data, offsets, value, num);

    BacktoCudaDev(tensor->devID, devIDBackup);
}
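
/* 
Usage note (an observation on the code above, not a tested example): the
"offsets" pointer is passed to the kernel directly, so unlike
_CudaSetDataWithOffsetAndValue below, the offset array must already reside
in GPU memory:

    // offsetsGPU - flat positions (MTYPE) already on the same device as "a"
    _CudaSetDataWithOffset(&a, offsetsGPU, 1.0F, num);
*/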

/*
set the data with arrays of offsets and values (kernel version)
>> data - pointer to the data array
>> offsets - offset for each data item
>> values - the value for each data item
>> num - number of the data items
>> dataType - the data type of the data and values
*/
__global__
void KernelSetDataWithOffsetAndValue(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
{
    /* index */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < num) {
        if (dataType == X_INT)
            *((int *)data + offsets[i]) = *((int *)values + i);
        else if (dataType == X_FLOAT)
            *((float *)data + offsets[i]) = *((float *)values + i);
    }
}

/*
set the data with arrays of offsets and values (cuda version)
>> tensor - the tensor that keeps the data
>> offsets - offset for each data item
>> values - the value for each data item
>> num - number of the data items
*/
void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
{

    XMem * mem = tensor->mem;
    MTYPE offsetSize = num * sizeof(MTYPE);
    MTYPE valueSize;

    if (tensor->dataType == X_INT)
        valueSize = num * sizeof(int);
    else if (tensor->dataType == X_FLOAT)
        valueSize = num * sizeof(float);
    else
        ShowNTErrors("TO DO!!!");

    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, (int)num, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    MTYPE * offsetsCuda = mem != NULL ? 
                            (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) : 
                            (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
    void * valuesCuda  = mem != NULL ? 
                            mem->AllocBuf(mem->devID, valueSize) : 
                            XMemAlloc(tensor->devID, valueSize);

    if (mem != NULL) {
        XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
        XMemCopy(valuesCuda, mem->devID, values, -1, valueSize);
    }
    else {
        XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
        XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
    }

    KernelSetDataWithOffsetAndValue<<<blocks, threads >>> (tensor->data, offsetsCuda, valuesCuda, num, tensor->dataType);

    if (mem != NULL) {
        mem->ReleaseBuf(mem->devID, valueSize);
        mem->ReleaseBuf(mem->devID, offsetSize);
    }
    else {
        XMemFree(tensor->devID, valuesCuda);
        XMemFree(tensor->devID, offsetsCuda);
    }

    BacktoCudaDev(tensor->devID, devIDBackup);
}
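
/* 
Usage sketch (illustrative only; "a" is a float tensor on the GPU). Here the
offset and value arrays live on the host; the function copies them to GPU
memory (or the memory pool) before launching the kernel:

    MTYPE offsets[2] = {0, 5};
    float values[2]  = {1.0F, 2.0F};
    _CudaSetDataWithOffsetAndValue(&a, offsets, values, 2);
    // a.data[0] = 1.0 and a.data[5] = 2.0
*/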

#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)