/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
*/

#include "SetAscendingOrder.cuh"
#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/* 
set the cell to the ascending order along a given dimension (kernel code)

The data is addressed as blockNum blocks of (strideNum * stride) items each.
An item at position j along the given dimension is assigned the value j.

Launch layout: a 2D grid. The x axis enumerates the (stride * blockNum)
positions that are NOT along the given dimension, the y axis enumerates the
strideNum positions along it. blockDim.x must not exceed
MAX_CUDA_THREAD_NUM_PER_BLOCK because the shared arrays are indexed by
threadIdx.x.

>> data - the data array
>> stride - how many items we go over when moving to the next item along the dimension
>> strideNum - size of the given dimension
>> blockNum - number of data blocks (each block holds strideNum * stride items)
*/
__global__
void KernelSetAscendingOrder(int * data, int stride, int strideNum, int blockNum)
{
    /* per-column cache of (i / stride) and (i % stride); all threads that share
       threadIdx.x also share i, so the division is done once by the y == 0 thread */
    __shared__ int iBlock[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ int iOffset[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    /* flat index over the (blockNum x stride) positions orthogonal to the dimension */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* index along the given dimension */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    /* Out-of-range threads must not leave before the barrier below:
       __syncthreads() has to be reached by every thread of the block.
       So we only record whether this thread has work to do. */
    bool inRange = (i < stride * blockNum) && (j < strideNum);

    /* If any thread of a column is in range, its threadIdx.y == 0 thread is in
       range as well (same i, smallest j), so every entry that is read after
       the barrier has been written. The guard also avoids dividing by a zero stride. */
    if(inRange && threadIdx.y == 0){
        iBlock[threadIdx.x] = i / stride;
        iOffset[threadIdx.x] = i % stride;
    }
    __syncthreads();

    if(!inRange)
        return;

    /* offset = (block * strideNum + j) * stride + offset inside the stride */
    data[(iBlock[threadIdx.x] * strideNum + j) * stride + iOffset[threadIdx.x]] = j;
}

/* 
set the cell to the ascending order along a given dimension

Every item of the tensor is overwritten by its own index along dimension "dim".
The work is done on the device a->devID by KernelSetAscendingOrder. The
kernel is launched asynchronously; only the launch itself is checked here.

>> a - the tensor (its data type must be X_INT)
>> dim - the dimension, in the range [0, a->order)
*/
void CudaSetAscendingOrder(XTensor * a, int dim)
{
    CheckNTErrors((a != NULL), "Empty input tensor!");
    CheckNTErrors((a->dataType == X_INT), "TODO!");

    /* an out-of-range dim would index dimSizeRDI out of bounds below */
    CheckNTErrors((dim >= 0 && dim < a->order), "Illegal dimension!");

    /* dimSizeRDI is indexed in reversed dimension order */
    int dimRDI = a->order - dim - 1;

    /* stride: product of the sizes at reversed indices below dimRDI */
    int stride = 1;
    int strideNum = a->dimSizeRDI[dimRDI];
    for(int i = 0; i < dimRDI; i++)
        stride *= a->dimSizeRDI[i];

    /* blockNum: product of the sizes at reversed indices above dimRDI */
    int blockNum = 1;
    for(int i = dimRDI + 1; i < a->order; i++)
        blockNum *= a->dimSizeRDI[i];

    /* Nothing to fill. These are exactly the cases in which the kernel's range
       test rejects every thread, so skip the launch instead of asking for a
       degenerate launch configuration. */
    if(strideNum <= 0 || stride * blockNum <= 0)
        return;

    int gridSize[3];
    int blockSize[3];

    /* NOTE(review): presumably index 0 of gridSize/blockSize belongs to the first
       size (strideNum) and index 1 to the second (stride * blockNum) - confirm
       against XDevice. The launch below maps index 1 to x and index 0 to y,
       which is the layout the kernel indexes with. */
    GDevs.GetCudaThread2D(a->devID, strideNum, stride * blockNum, MAX_INT, gridSize, blockSize);

    int devIDBackup;
    ProtectCudaDev(a->devID, devIDBackup);

    KernelSetAscendingOrder<<<dim3(gridSize[1], gridSize[0]), dim3(blockSize[1], blockSize[0])>>>
                            ((int*)a->data, stride, strideNum, blockNum);

    /* a kernel launch returns no status; a bad configuration is only visible
       through cudaGetLastError(). Read it now, restore the device, then check. */
    cudaError_t launchStatus = cudaGetLastError();

    BacktoCudaDev(a->devID, devIDBackup);

    CheckNTErrors((launchStatus == cudaSuccess), "Failed to launch KernelSetAscendingOrder!");
}
#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)