/*
 * NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-05-08
 */

#include <math.h>
#include "SetData.h"
#include "SetData.cuh"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"

#if !defined( WIN32 ) && !defined( _WIN32 )
    #include "sys/time.h"
    #include "time.h"
    #include "iconv.h"
#else
    #include "time.h"
    #include "windows.h"
    #include "process.h"
#endif

namespace nts{ // namespace nts(NiuTrans.Tensor)

/*
fill the input tensor with values drawn from a uniform distribution, following the method
described in "Understanding the difficulty of training deep feedforward neural networks"
- Glorot, X. & Bengio, Y. (2010). The resulting tensor has values sampled from U(-a, a),
where a = gain * sqrt(2 / (fan_in + fan_out)) * sqrt(3) = gain * sqrt(6 / (fan_in + fan_out)).
This is also known as Glorot (Xavier) initialization.

>> tensor - the tensor whose data array would be initialized
>> gain - an optional scaling factor
*/
void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
{
    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
    CheckNTErrors(tensor->order >= 2, "the tensor order must be no less than 2!");

    int fanIn = 1;
    int fanOut = 1;

    int order = tensor->order;
    if (order == 2) {
        fanIn = tensor->dimSize[1];
        fanOut = tensor->dimSize[0];
    }
    else {
        int numInputFmaps = tensor->dimSize[1];
        int numOutputFmaps = tensor->dimSize[0];
        /* the receptive field size is the product of the remaining (spatial) dimensions */
        int receptiveFieldSize = 1;
        for (int i = 2; i < order; i++)
            receptiveFieldSize *= tensor->dimSize[i];
        fanIn = numInputFmaps * receptiveFieldSize;
        fanOut = numOutputFmaps * receptiveFieldSize;
    }

    DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
    DTYPE a = (DTYPE)sqrt(3.0) * std;
    _SetDataRand(tensor, -a, a);
}
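
/*
A minimal usage sketch (illustrative only, not compiled into the library). It assumes
the InitTensor2D helper declared in XTensor.h and a CPU-side tensor (devID = -1):

    XTensor w;
    InitTensor2D(&w, 512, 1024, X_FLOAT);
    _SetDataFanInOut(&w, 1.0F);   // entries drawn from U(-a, a) with a = sqrt(6 / (512 + 1024))
*/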

/* 
generate data items with a fixed value p 
>> tensor - the tensor whose data array would be initialized
>> valuePointer - pointer to the value used to initialize the tensor
*/
void _SetDataFixed(XTensor * tensor, void * valuePointer)
{
    int num = tensor->unitNum;

    if(tensor->dataType == X_INT){
        int p = *(int*)valuePointer;
        if(tensor->devID < 0){
            int * d = (int*)tensor->data;
            if(num % 4 == 0){
                for(int i = 0; i < num; i += 4){
                    d[i] = p;
                    d[i + 1] = p;
                    d[i + 2] = p;
                    d[i + 3] = p;
                }
            }
            else{
                for(int i = 0; i < num; i++)
                    d[i] = p;
            }
        }
        else{
#ifdef USE_CUDA
            _CudaSetDataFixedInt(tensor, p);
#endif
        }
    }
    else if(tensor->dataType == X_FLOAT){
        float p = *(float*)valuePointer;
        if(tensor->devID < 0){
            float * d = (float*)tensor->data;
            if(num % 4 == 0){
                for(int i = 0; i < num; i += 4){
                    d[i] = p;
                    d[i + 1] = p;
                    d[i + 2] = p;
                    d[i + 3] = p;
                }
            }
            else{
                for(int i = 0; i < num; i++)
                    d[i] = p;
            }
        }
        else{
#ifdef USE_CUDA
            _CudaSetDataFixedFloat(tensor, p);
#endif
        }
    }
    else if(tensor->dataType == X_DOUBLE){
        double p = *(double*)valuePointer;
        if(tensor->devID < 0){
            double * d = (double*)tensor->data;
            if(num % 4 == 0){
                for(int i = 0; i < num; i += 4){
                    d[i] = p;
                    d[i + 1] = p;
                    d[i + 2] = p;
                    d[i + 3] = p;
                }
            }
            else{
                for(int i = 0; i < num; i++)
                    d[i] = p;
            }
        }
        else{
#ifdef USE_CUDA
            _CudaSetDataFixedDouble(tensor, p);
#endif
        }
    }
    else{
        ShowNTErrors("TODO");
    }
}
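
/*
A usage sketch for the void* interface above (illustrative only). It assumes the
InitTensor2D helper from XTensor.h and the default build where DTYPE is float:

    XTensor t;
    InitTensor2D(&t, 2, 3, X_FLOAT);
    float v = 0.5F;
    _SetDataFixed(&t, &v);   // every entry of t becomes 0.5
    SetDataFixed(t, 1.0F);   // the DTYPE wrapper defined below does the same with 1.0
*/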

/* 
generate data items with a fixed value p (in default type) 
>> tensor - the tensor whose data array would be initialized
>> p - number in default type
*/
void SetDataFixed(XTensor &tensor, DTYPE p)
{
    _SetDataFixed(&tensor, &p);
}

/* 
generate data items with a fixed value p (in integer) 
>> tensor - the tensor whose data array would be initialized
>> p - an int-valued number
*/
void _SetDataFixedInt(XTensor * tensor, int p)
{
    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");

    if(p == 0)
        tensor->SetZeroAll();
    else
        _SetDataFixed(tensor, &p);
}

/*
generate data items with a fixed value p (in float) 
>> tensor - the tensor whose data array would be initialized
>> p - a float-valued number
*/
void _SetDataFixedFloat(XTensor * tensor, float p)
{
    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");

    if(p == 0)
        tensor->SetZeroAll();
    else
        _SetDataFixed(tensor, &p);
}

/* 
generate data items with a fixed value p (in double) 
>> tensor - the tensor whose data array would be initialized
>> p - a double-valued number
*/
void _SetDataFixedDouble(XTensor * tensor, double p)
{
    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");

    if(p == 0)
        tensor->SetZeroAll();
    else
        _SetDataFixed(tensor, &p);
}

/* 
generate data as lower triangular matrices for the last two dimensions
>> tensor - the tensor whose data to be set
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal
e.g., for a 3 * 3 tensor, 
      when p = 1 and shift = 0, we have
      1 0 0
      1 1 0
      1 1 1
      when p = 2 and shift = -1, we have
      0 0 0
      2 0 0
      2 2 0
*/
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
{
    int n = tensor->order;

    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
    CheckNTErrors(n >= 2, "The tensor must have an order of no less than 2!");
    CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2), 
                 "The last two dimensions must be of the same size!");

    if(tensor->devID < 0){
        int l = tensor->GetDim(-1);
        int blockNum = 1;
        int blockSize = l * l;
        for(int i = 0; i < n - 2; i++)
            blockNum *= tensor->GetDim(i);

        for(int i = 0; i < blockNum; i++){
            DTYPE * d = (DTYPE*)tensor->data + i * blockSize;
            for(int row = 0; row < l; row++){
                /* entries on and below the shifted diagonal are set to p, the rest to 0 */
                for(int col = 0; col < l; col++){
                    d[row * l + col] = (col <= row + shift) ? p : 0;
                }
            }
        }
    }
    else{
#ifdef USE_CUDA
        _CudaSetDataLowTri(tensor, p, shift);
#endif
    }
}
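
/*
A usage sketch (illustrative only; it assumes the InitTensor2D helper from XTensor.h),
e.g., building a 4 * 4 lower triangular mask:

    XTensor mask;
    InitTensor2D(&mask, 4, 4, X_FLOAT);
    _SetDataLowTri(&mask, 1.0F, 0);   // 1s on and below the diagonal, 0s above it
*/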

/*
generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
*/
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{
    CheckNTErrors(upper > lower, "the upper value must be greater than the lower value!");

    if(tensor == NULL)
        return;
    
    /* CPU code */
    if(tensor->devID < 0){
        DTYPE range = upper - lower;
        
        if(tensor->dataType == X_FLOAT){
            float * d = (float*)tensor->data;
            for(int i = 0; i < tensor->unitNum; i++){
                d[i] = range * ((float)rand()/RAND_MAX) + lower;
            }
        }
        else if(tensor->dataType == X_DOUBLE){
            double * d = (double*)tensor->data;
            for(int i = 0; i < tensor->unitNum; i++){
                d[i] = range * ((double)rand()/RAND_MAX) + lower;
            }
        }
        else{
            ShowNTErrors("TODO");
        }
    }
    /* 
    GPU code
    The data is generated directly on the GPU. An alternative (shown in the
    commented-out code below) is to initialize a temporary tensor on the CPU
    and then copy its data to the GPU.
    */
    else{
#ifdef USE_CUDA
        _CudaSetDataRand(tensor, lower, upper);
#endif
        //XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
        //_SetDataRand(t2, low, high);
        //_CopyValues(t2, tensor);
        //delete t2;
    }
}
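
/*
A usage sketch (illustrative only; it assumes the InitTensor2D helper from XTensor.h).
Note that the CPU branch relies on rand(), so the caller is expected to seed it once:

    XTensor w;
    InitTensor2D(&w, 3, 4, X_FLOAT);
    srand((unsigned int)time(NULL));
    _SetDataRand(&w, -0.1F, 0.1F);   // entries drawn uniformly from [-0.1, 0.1]
*/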
    

/*
generate data items with a normal distribution of specified mean and standard deviation
>> tensor - the tensor whose data array would be initialized
>> mean - mean or expectation of the distribution
>> standardDeviation - standard deviation of the distribution
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation)
{
    // TODO: rewrite it and add CUDA code
    tensor->SetDataRandn(mean, standardDeviation);
}
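
/*
A usage sketch (illustrative only; it assumes the InitTensor2D helper from XTensor.h):

    XTensor w;
    InitTensor2D(&w, 3, 4, X_FLOAT);
    _SetDataRandN(&w, 0.0F, 0.02F);   // entries drawn from N(0, 0.02^2)
*/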

} // namespace nts(NiuTrans.Tensor)