/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Unsqueeze.h"
#include "Unsqueeze.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
insert a dimension by copying each block n times
(where n is the size of the inserted dimension)
>> s - pointer to the source data array
>> blockSize - size of a block
>> blockNum - number of the blocks
>> t - pointer to the target data array
>> n - number of copies of each block, i.e., the size of the inserted dimension
*/
template<class T>
__global__
void KernelUnsqueeze(void * s, int blockSize, int blockNum, void * t, int n)
{
    /* index of the data item within a block */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block index */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    if (i >= blockSize || j >= blockNum)
        return;

    /* block j starts at offset in the source and at offset * n in the
       target, because every preceding block is replicated n times there */
    MTYPE offset = (MTYPE)blockSize * j;
    T value = ((T*)s)[offset + i];
    T * tData = (T*)t + offset * n;
    int length = blockSize * n;

    /* write the item into each of the n copies of the block */
    for (int k = i; k < length; k += blockSize)
        tData[k] = value;
}

/*
insert a dimension by copying each block dSize times
(where dSize is the size of the inserted dimension)
>> a - input tensor
>> b - output tensor
>> dim - where to insert the dimension
>> dSize - size of the newly-inserted dimension
*/
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
{
    int blockSize = 1;
    int blockNumA = 1;
    int blockNumB = 1;

    /* the dimension index in right-to-left (reverse) order */
    int dimRDI = b->order - dim - 1;
    for (int i = 0; i < dimRDI; i++)
        blockSize *= a->dimSizeRDI[i];

    blockNumA = a->unitNum / blockSize;
    blockNumB = b->unitNum / blockSize;

    CheckNTErrors((blockNumA * dSize == blockNumB), "Unmatched tensors!");

    int cudaGrids[3];
    int cudaBlocks[3];

    GDevs.GetCudaThread2D(a->devID, blockSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);

    int devIDBackup = 0;
    ProtectCudaDev(a->devID, devIDBackup);

    if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
        KernelUnsqueeze<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                (a->data, blockSize, blockNumA, b->data, dSize);
    }
    else if (a->dataType == X_INT && b->dataType == X_INT) {
        KernelUnsqueeze<int> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                              (a->data, blockSize, blockNumA, b->data, dSize);
    }
    else {
        ShowNTErrors("TODO!");
    }

    BacktoCudaDev(a->devID, devIDBackup);
}

#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)
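
/*
Usage sketch (illustrative only, not part of this file). It assumes the
InitTensor2D/InitTensor3D helpers and XTensor::SetDataRand declared elsewhere
in NiuTrans.Tensor, and a CUDA device with ID 0. Unsqueezing a 2 x 3 tensor
at dim = 1 with dSize = 4 yields a 2 x 4 x 3 tensor in which each row of the
input is repeated 4 times along the new dimension:

    XTensor a, b;
    InitTensor2D(&a, 2, 3, X_FLOAT, 0);      // input tensor on GPU 0
    InitTensor3D(&b, 2, 4, 3, X_FLOAT, 0);   // output tensor with the new dimension
    a.SetDataRand(-1.0F, 1.0F);              // fill the input with random values
    _CudaUnsqueeze(&a, &b, 1, 4);            // afterwards b[i][k][j] == a[i][j] for all k
*/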