/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
matrix multiplication of the two tensors

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the corresponding flag is set

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                       const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                       XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
                  "Input tensors must have an order >= 2!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Input and output tensors must have the same order!");

    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0)
        _MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
    else
        _MatrixMulBatchedCPU(a, transposedA, b, transposedB, c, alpha, beta);
}
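
/*
a minimal usage sketch of the routine above (an illustration, not part of the
library; the shapes are hypothetical and it assumes the NewTensor3D helper
from XTensor.h). Two 3-order tensors are treated as batches of eight 2 * 3
and 3 * 4 matrices; a devID of -1 keeps everything on the CPU, so the call
is routed to _MatrixMulBatchedCPU:

    XTensor * a = NewTensor3D(8, 2, 3, X_FLOAT, -1);
    XTensor * b = NewTensor3D(8, 3, 4, X_FLOAT, -1);
    XTensor * c = NewTensor3D(8, 2, 4, X_FLOAT, -1);
    a->SetZeroAll();
    b->SetZeroAll();
    _MatrixMulBatched(a, X_NOTRANS, b, X_NOTRANS, c, 1.0F, 0.0F, NULL);
    delete a;
    delete b;
    delete c;
*/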

/*
matrix multiplication of the two tensors
optimized for GPU

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the corresponding flag is set

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta)
{
#ifdef USE_CUDA
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
                  "Input tensors must have an order >= 2!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Input and output tensors must have the same order!");
    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");

    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
    int blockNum = 1;

    for (int i = 2; i < a->order; i++) {
        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        blockNum *= a->dimSizeRDI[i];
    }
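
    /* a quick worked example of the layout (hypothetical shapes): for a of
       shape (8, 2, 3), aBlockSize = 3 * 2 = 6 elements and blockNum = 8, so
       the i-th matrix in the batch starts at element offset i * 6; these
       block sizes are exactly the per-matrix strides handed to the strided
       cuBLAS call below */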

    int devIDBackup = 0;
    ProtectCudaDev(a->devID, devIDBackup);

    cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
    _CudaBLASMatrixMULBatchedStrided(handle,
                                     a->data, transposedA, a->dataType, aBlockSize,
                                     b->data, transposedB, b->dataType, bBlockSize,
                                     c->data, c->dataType, cBlockSize, blockNum,
                                     a->dimSizeRDI[1], a->dimSizeRDI[0],
                                     b->dimSizeRDI[1], b->dimSizeRDI[0],
                                     c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);

    BacktoCudaDev(a->devID, devIDBackup);
#endif
}

/*
matrix multiplication of the two tensors
optimized for CPU

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the corresponding flag is set

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta)
{
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Input tensors should have the same data type!");
    CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
                  "Input tensors must have an order >= 2!");
    CheckNTErrors(a->order == b->order && a->order == c->order,
                  "Input and output tensors must have the same order!");

    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
    int aRealBlockSize = aBlockSize * a->unitSize;
    int bRealBlockSize = bBlockSize * b->unitSize;
    int cRealBlockSize = cBlockSize * c->unitSize;
    int blockNum = 1;

    for (int i = 2; i < a->order; i++) {
        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        blockNum *= a->dimSizeRDI[i];
    }

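    /* note that the dimension sizes below are negated: a negative dimension
       size is the library's convention for skipping allocation of the data
       array, so ai, bi and ci are mere shells whose data pointers are
       redirected into a, b and c inside the loop */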
    int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
    int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
    int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};

    XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
    XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
    XTensor * ci = NewTensor2D(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);

    for (int i = 0; i < blockNum; i++) {
        ai->data = (char*)a->data + i * aRealBlockSize;
        bi->data = (char*)b->data + i * bRealBlockSize;
        ci->data = (char*)c->data + i * cRealBlockSize;
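        /* prefer the single-matrix BLAS kernel when it is compiled in and
           enabled at run time; otherwise fall back to the generic 2-d
           implementation */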
#ifdef USE_BLAS
        if (useBLAS)
            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
        else
            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
    }

    ai->data = NULL;
    bi->data = NULL;
    ci->data = NULL;
    delete ai;
    delete bi;
    delete ci;
}

/*
matrix multiplication in batch mode for list inputs (BLAS)
c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0, count - 1]

>> a - list of input matrices (2d tensors)
>> transposedA - indicates whether the matrices in a are transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicates whether the matrices in b are transposed
>> c - list of output matrices (2d tensors)
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
                          const XList * b, MATRIX_TRANS_TYPE transposedB,
                          XList * c, DTYPE alpha, DTYPE beta)
{
    CheckNTErrors(a && b && c, "Empty input lists!");
    CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");

    if (a->count == 0)
        return;

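    /* check whether all the matrices in each list share the same shape;
       note that the result is computed here but not yet consulted below */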
    bool isUniform = true;
    for (int i = 1; i < a->count; i++) {
        XTensor * aim = (XTensor*)a->GetItem(i - 1);
        XTensor * bim = (XTensor*)b->GetItem(i - 1);
        XTensor * cim = (XTensor*)c->GetItem(i - 1);
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);
        XTensor * ci = (XTensor*)c->GetItem(i);
        if (!XTensor::IsSameShaped(aim, ai) ||
            !XTensor::IsSameShaped(bim, bi) ||
            !XTensor::IsSameShaped(cim, ci))
        {
            isUniform = false;
            break;
        }
    }

    for (int i = 0; i < a->count; i++) {
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);
        XTensor * ci = (XTensor*)c->GetItem(i);
        CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
        CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
        if (useBLAS)
            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
        else
            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
    }
}
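
/*
a minimal sketch of driving the list-based version above (an illustration,
not part of the library; the shapes are hypothetical and cleanup is omitted
for brevity). Unlike the tensor-based version, the matrices here need not
share one underlying data block, so every entry is allocated independently:

    XList aList(2), bList(2), cList(2);
    for (int i = 0; i < 2; i++) {
        XTensor * ai = NewTensor2D(2, 3, X_FLOAT, -1);
        XTensor * bi = NewTensor2D(3, 4, X_FLOAT, -1);
        ai->SetZeroAll();
        bi->SetZeroAll();
        aList.Add(ai);
        bList.Add(bi);
        cList.Add(NewTensor2D(2, 4, X_FLOAT, -1));
    }
    _MatrixMulBatchedCPU(&aList, X_NOTRANS, &bList, X_NOTRANS, &cList, 1.0F, 0.0F);
*/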

/*
matrix multiplication of the two tensors (return an XTensor structure)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha
where trans() returns the transposed matrix if the corresponding flag is set
(beta is fixed to 0 in this call).

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensors must have the same order!");

    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMP();

    /* call the _MatrixMulBatched function */
    _MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, transposedA);
    XLink::AddParamToHeadTrans(&c, transposedB);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}
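
/*
a quick shape check for the overload above (illustrative values, assuming the
InitTensor3D helper from XTensor.h): with a of shape (8, 3, 2) and X_TRANS,
each trans(ai) is a 2 * 3 matrix; with b of shape (8, 3, 4) and X_NOTRANS,
each bi is 3 * 4; am == bn (= 3) holds and c comes out with shape (8, 2, 4):

    XTensor a, b;
    InitTensor3D(&a, 8, 3, 2, X_FLOAT);
    InitTensor3D(&b, 8, 3, 4, X_FLOAT);
    a.SetZeroAll();
    b.SetZeroAll();
    XTensor c = MatrixMulBatched(a, X_TRANS, b, X_NOTRANS, 1.0F, NULL);
*/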

/*
matrix multiplication of the two tensors (return an XTensor structure)
c = a * b * alpha
make a new tensor to keep the result and return it

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = ai * bi * alpha (beta is fixed to 0 in this call)

>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
*/
XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
                         DTYPE alpha, XPRunner * parallelRunner)
{
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
    CheckNTErrors(a.order == b.order, "Input tensors must have the same order!");

    int an = a.dimSizeRDI[1];
    int am = a.dimSizeRDI[0];
    int bn = b.dimSizeRDI[1];
    int bm = b.dimSizeRDI[0];

    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

    int order = a.order;
    int sub = 0;
    int * dimSize = new int[order];
    for (int i = 0; i < a.order - 2; i++)
        dimSize[sub++] = a.dimSize[i];
    dimSize[sub++] = an;
    dimSize[sub++] = bm;

    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
    c.SetTMP();

    /* call the _MatrixMulBatched function */
    _MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);

    /* tensor connections */
    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
    XLink::AddParamToHead(&c, alpha);

    /* destroy variables */
    delete[] dimSize;

    return c;
}

} // namespace nts(NiuTrans.Tensor)