/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-02 float16/int/int8 added
 */

#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sum.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
summation of data arrays (CUDA Kernel)
c = a + b * \beta
>> a - a data array
>> b - another data array
>> c - where we put a + b * \beta
>> size - the size of a/b/c
>> beta - the coefficient
*/
template <class T>
__global__
void KernelADD(T * a, T * b, T * c, int size, T beta)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size)
        c[i] = a[i] + b[i] * beta;
}

/*
summation of int data arrays (CUDA Kernel)
c = a + b * (int)\beta
note: _CudaSum below uses the templated KernelADD for the int case;
this non-template variant keeps a DTYPE beta and truncates it to int
>> a - an int data array
>> b - another int data array
>> c - where we put a + b * (int)\beta
>> size - the size of a/b/c
>> beta - the coefficient (truncated to int)
*/
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size)
        c[i] = a[i] + b[i] * (int)beta;
}
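/*
A minimal usage sketch for the templated kernel above, kept inside a comment so
it stays out of the build. The fixed block size (256) and the raw device
buffers are assumptions for illustration only; the library code below derives
its launch configuration from GDevs.GetCudaThread instead.

    const int size = 1024;
    float beta = 2.0F;
    float *a = NULL, *b = NULL, *c = NULL;
    cudaMalloc(&a, size * sizeof(float));
    cudaMalloc(&b, size * sizeof(float));
    cudaMalloc(&c, size * sizeof(float));
    // ... fill a and b, e.g., via cudaMemcpy ...
    int threads = 256;
    int blocks = (size + threads - 1) / threads;
    KernelADD<float><<<blocks, threads>>>(a, b, c, size, beta);  // c = a + b * beta
    cudaDeviceSynchronize();
    cudaFree(a); cudaFree(b); cudaFree(c);
*/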
/*
tensor summation c = a + b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
>> c - where we put a + b * \beta. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
    CheckNTErrors(a && b && c, "Empty tensor input!");
    CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
                  "Unmatched tensors in addition!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Unmatched tensors in addition!");
    CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
                  "The tensors must be on the same device!");
    CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE && c->dataType == DEFAULT_DTYPE) ||
                  (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16 && c->dataType == X_FLOAT16) ||
                  (a->dataType == X_INT && b->dataType == X_INT && c->dataType == X_INT) ||
                  (a->dataType == X_INT8 && b->dataType == X_INT8 && c->dataType == X_INT8),
                  "The sum function does not support this datatype.");

    int devIDBackup = XDevice::GetGPUDevice();
    XDevice::SetGPUDevice(a->devID);

    if (!a->isSparse && !b->isSparse) {
        CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");

        if (a->dataType == DEFAULT_DTYPE &&
            b->dataType == DEFAULT_DTYPE &&
            c->dataType == DEFAULT_DTYPE)
        {
            /* pick a cuBLAS handle: prefer the handles kept by the memory
               pools of a and b; otherwise fall back to the global handle
               for this device */
            cublasHandle_t * handle = NULL;
            if ((a->mem != NULL) && (b->mem != NULL)) {
                cublasHandle_t * handleA = a->mem->GetCublasHandle();
                cublasHandle_t * handleB = b->mem->GetCublasHandle();
                handle = *handleA != 0 ? handleA : handleB;
            }
            else {
                handle = GDevs.GetCudaHandle(a->devID);
            }

            /* in-place summation (a = a + b * \beta) via cuBLAS axpy */
            if ((c == a && handle != NULL) && *handle != 0) {
#ifdef DOUBELPRICSION
                cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#else
                cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#endif
            }
            else {
                int gridSize[3], blockSize[3];

                GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);

                dim3 blocks(gridSize[0]);
                dim3 threads(blockSize[0]);

                KernelADD<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
            }
        }
        else if (a->dataType == X_FLOAT16 &&
                 b->dataType == X_FLOAT16 &&
                 c->dataType == X_FLOAT16)
        {
            int gridSize[3], blockSize[3];

            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);

            dim3 blocks(gridSize[0]);
            dim3 threads(blockSize[0]);

            /* convert beta to half before launching the kernel */
            unsigned short temp = FloatToFloat16(beta);
            half beta1 = *((half *)&temp);

            KernelADD<<<blocks, threads>>>((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta1);
        }
        else if (a->dataType == X_INT &&
                 b->dataType == X_INT &&
                 c->dataType == X_INT)
        {
            int gridSize[3], blockSize[3];

            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);

            dim3 blocks(gridSize[0]);
            dim3 threads(blockSize[0]);

            int beta1 = (int)beta;

            KernelADD<<<blocks, threads>>>((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta1);
        }
        else if (a->dataType == X_INT8 &&
                 b->dataType == X_INT8 &&
                 c->dataType == X_INT8)
        {
            int gridSize[3], blockSize[3];

            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);

            dim3 blocks(gridSize[0]);
            dim3 threads(blockSize[0]);

            __int8 beta1 = (__int8)beta;

            KernelADD<<<blocks, threads>>>((__int8 *)a->data, (__int8 *)b->data, (__int8 *)c->data, a->unitNum, beta1);
        }
        else {
            // TODO!!
            ShowNTErrors("TODO!");
        }
    }
    else {
        // TODO!!
        ShowNTErrors("TODO!");
    }

    XDevice::SetGPUDevice(devIDBackup);
}

/*
tensor summation c = a + b * \beta (cuda version) with an input handle
>> devID - device ID (MUST >= 0)
>> handle - cuda handle
>> a - an array
>> b - another array
>> c - where we put a + b * \beta. we save it in a if c is NULL
>> size - size of the array
>> beta - the scaling factor
*/
void _CudaSumWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
    if (size == 0)
        return;

    if (c == NULL)
        c = a;

    CheckNTErrors((a && b && c), "Empty arrays in addition!");

    int devIDBackup;
    ProtectCudaDev(devID, devIDBackup);

    /* in-place summation (a = a + b * \beta) via cuBLAS axpy */
    if (c == a) {
#ifdef DOUBELPRICSION
        cublasDaxpy(*handle, size, &beta, b, 1, a, 1);
#else
        cublasSaxpy(*handle, size, &beta, b, 1, a, 1);
#endif
    }
    else {
        int gridSize[3], blockSize[3];

        GDevs.GetCudaThread(devID, size, gridSize, blockSize);

        dim3 blocks(gridSize[0]);
        dim3 threads(blockSize[0]);

        KernelADD<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
    }

    BacktoCudaDev(devID, devIDBackup);
}

#endif // USE_CUDA

} // namespace nts(NiuTrans.Tensor)
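/*
An illustrative call sequence for _CudaSumWithHandle (comment only, not part of
the library). The device ID, buffer size, and cuBLAS setup here are assumptions
for the example, and error checking is omitted. Passing c = NULL makes the
routine accumulate into a via the axpy path, i.e. a = a + 0.5 * b.

    const int size = 4096;
    DTYPE *a = NULL, *b = NULL;
    cudaSetDevice(0);
    cudaMalloc(&a, size * sizeof(DTYPE));
    cudaMalloc(&b, size * sizeof(DTYPE));
    // ... fill a and b ...
    cublasHandle_t handle;
    cublasCreate(&handle);
    nts::_CudaSumWithHandle(0, &handle, a, b, NULL, size, (DTYPE)0.5);
    cublasDestroy(&handle);
    cudaFree(a);
    cudaFree(b);
*/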