/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/

#include <math.h>
#include "ReduceSum.h"
#include "ReduceSum.cuh"
#include "../../XName.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

/* 
sum the items along a dimension of the tensor

For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true

>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> shift - bias subtracted from the input (can be NULL)
>> power - we perform pow(item_i - shift_i, power) on each item
>> isExp - specify whether exp() is applied to each term
*/
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
    CheckNTErrors((input && output), "Empty input or output tensors!");
    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
                  "This code must be run on the same device!");
    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
    CheckNTErrors((input->order > dim && dim >= 0), "Illegal dimension to reduce!");
    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
    CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");

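    /* dimRDI is the index of the reduced dimension in the reverse-order
       layout used by dimSizeRDI, where dimension 0 is the innermost one */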
    int dimRDI = input->order - dim - 1;
    for(int i = 0; i < input->order; i++){
        if(i < dimRDI){
            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
        }
        else if(i > dimRDI){
            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
        }
    }

    if(input->devID >= 0){
#ifdef USE_CUDA
        _CudaReduceSum(input, output, dim, shift, power, isExp);
#else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
    }
    else{
        CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

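        /* view the data as "blockNum" blocks of "blockSize" elements each:
           "strideNum" is the length of the reduced dimension, and consecutive
           elements along that dimension are "stride" positions apart */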
        int stride = 1;
        int strideNum = input->dimSizeRDI[dimRDI];
        int blockSize = 1;
        int blockNum = 1;
        for (int i = 0; i < input->order; i++) {
            if (i < dimRDI)
                stride *= input->dimSizeRDI[i];
            else if (i > dimRDI)
                blockNum *= input->dimSizeRDI[i];
        }
        blockSize = stride * strideNum;

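        /* reduce each block: for each of the "stride" positions, accumulate
           the "strideNum" elements that are spaced "stride" apart */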
        for(int k = 0; k < blockNum; k++){
            DTYPE * ip = (DTYPE*)input->data + blockSize * k;
            DTYPE * op = (DTYPE*)output->data + stride * k;
            DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * k : NULL;
            for(int i = 0; i < stride; i++){
                DTYPE sum = 0;
                DTYPE bias = shift != NULL ? *(sp + i) : 0;
                DTYPE * ipe = ip + blockSize;
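                /* specialized loops for the common cases power == 1, 2 and 0.5
                   avoid calling the generic pow() on every element */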
                if(isExp){
                    if(bias == 0){
                        if(power == (DTYPE)1.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                                sum += (DTYPE)exp(*ipb);
                        }
                        else if(power == (DTYPE)2.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb);
                                sum += (DTYPE)exp(value * value);
                            }
                        }
                        else if(power == (DTYPE)0.5){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb);
                                sum += (DTYPE)exp(sqrt(value));
                            }
                        }
                        else{
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb);
                                sum += (DTYPE)exp(pow(value, power));
                            }
                        }
                    }
                    else{
                        if(power == (DTYPE)1.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                                sum += (DTYPE)exp(*ipb - bias);
                        }
                        else if(power == (DTYPE)2.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb) - bias;
                                sum += (DTYPE)exp(value * value);
                            }
                        }
                        else if(power == (DTYPE)0.5){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb) - bias;
                                sum += (DTYPE)exp(sqrt(value));
                            }
                        }
                        else{
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb) - bias;
                                sum += (DTYPE)exp(pow(value, power));
                            }
                        }
                    }
                }
                else{
                    if(bias == 0){
                        if(power == (DTYPE)1.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                                sum += *ipb;
                        }
                        else if(power == (DTYPE)2.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb);
                                sum += value * value;
                            }
                        }
                        else if(power == (DTYPE)0.5){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb);
                                sum += (DTYPE)sqrt(value);
                            }
                        }
                        else{
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb);
                                sum += (DTYPE)pow(value, power);
                            }
                        }
                    }
                    else{
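                        /* for power == 1 the shift can be factored out:
                           \sum_i (a_i - b) = (\sum_i a_i) - n * b */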
                        if(power == (DTYPE)1.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                                sum += *ipb;
                            sum -= strideNum * bias;
                        }
                        else if(power == (DTYPE)2.0){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb) - bias;
                                sum += value * value;
                            }
                        }
                        else if(power == (DTYPE)0.5){
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb) - bias;
                                sum += (DTYPE)sqrt(value);
                            }
                        }
                        else{
                            for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                                DTYPE value = (*ipb) - bias;
                                sum += (DTYPE)pow(value, power);
                            }
                        }
                    }
                }
                *(op + i) = sum;
            }
        }
    }
}

/* 
sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it

For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true

>> input - the input tensor
>> dim - the dimension along which the reduction is performed
>> shift - bias subtracted from the input
>> power - we perform pow(item_i - shift_i, power) on each item
>> isExp - specify whether exp() is applied to each term
<< return - the sum along a dimension of the tensor
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
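
    /* the output keeps the input shape but drops the reduced dimension */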
    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){
        if(i < dim)
            dimSize[i] = input.dimSize[i];
        else
            dimSize[i] = input.dimSize[i + 1];
    }

    float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
    XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
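    /* mark the output as a temporary (intermediate) tensor */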
    output.SetTMP();

    /* call _ReduceSum function */
    _ReduceSum(&input, &output, dim, &shift, power, isExp);
            
    /* tensor connection */
    XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
    XLink::AddParamToHeadInt(&output, dim);
    XLink::AddParamToHead(&output, power);
    XLink::AddParamToHeadBool(&output, isExp);

    /* destroy variables */
    delete[] dimSize;

    return output;
}
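
/*
A minimal usage sketch (for illustration only; it assumes the helpers
InitTensor2D() and XTensor::SetDataRand() provided elsewhere in the
library):

    XTensor a;
    InitTensor2D(&a, 2, 4, X_FLOAT);
    a.SetDataRand(0.0F, 1.0F);

    // s1_i = \sum_j a_{ij}: shape (2, 4) reduces to shape (2)
    XTensor s1 = ReduceSum(a, 1, (DTYPE)1.0F, false);

    // s2_i = \sum_j (a_{ij})^2
    XTensor s2 = ReduceSum(a, 1, (DTYPE)2.0F, false);
*/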

/* 
sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it

For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true

>> input - the input tensor
>> dim - the dimension along which the reduction is performed
>> power - we perform pow(item_i, power) on each item
>> isExp - specify whether exp() is applied to each term
<< return - the sum along a dimension of the tensor
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
{
    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");

    int order = input.order - 1;
    int * dimSize = new int[order];
    for(int i = 0; i < order; i++){
        if(i < dim)
            dimSize[i] = input.dimSize[i];
        else
            dimSize[i] = input.dimSize[i + 1];
    }

    float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
    XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
    output.SetTMP();

    /* call _ReduceSum function */
    _ReduceSum(&input, &output, dim, NULL, power, isExp);
            
    /* tensor connection */
    XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
    XLink::AddParamToHeadInt(&output, dim);
    XLink::AddParamToHead(&output, power);
    XLink::AddParamToHeadBool(&output, isExp);

    /* destroy variables */
    delete[] dimSize;

    return output;
}

} // namespace nts(NiuTrans.Tensor)