/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
 */

#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/ScaleAndShift.h"

namespace nts{ // namespace nts(NiuTrans.Tensor

/*
generate a random bernoulli number: 0 with probability "prob", 1 otherwise
The sample is drawn with rand(), so the result follows the state set by srand().
>> prob - probability of returning 0
<< return - (DTYPE)0.0 or (DTYPE)1.0
*/
DTYPE RandomBernoulli(DTYPE prob)
{
    /* Dividing by RAND_MAX + 1 keeps u inside [0, 1). Together with ">="
       this makes the edge cases exact: prob = 0 never returns 0 (even when
       rand() returns 0) and prob = 1 never returns 1. */
    const double u = (double)rand() / ((double)RAND_MAX + 1.0);

    return u >= (double)prob ? (DTYPE)1.0 : (DTYPE)0.0;
}

/*
dropout function

During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The mask is fully determined by "seed", "prob" and the number of elements.

This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".

Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
For p = 1 every element is dropped and the output is all zeros.
>> x - input tensor
>> y - output tensor
>> seed - value passed to srand() before the mask is sampled
>> prob - probability to set an element zero (must be in [0, 1])
*/
void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
{
    CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");

    /* prob = 1 is accepted by the check above, but 1/(1-p) would then divide
       by zero and turn the output into 0 * inf = NaN. Every element is dropped
       in that case anyway, so the scale is pinned to 0. */
    DTYPE scaleFactor = prob < (DTYPE)1.0 ?
                        (DTYPE)1.0 / ((DTYPE)1.0 - prob) :
                        (DTYPE)0.0;

    /* sample the mask on the host: one bernoulli number per element */
    srand(seed);
    int unitNum = x->unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(prob);

    /* copy the mask into a buffer tensor created from x */
    XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
    maskTensor->SetData(maskArray, unitNum);

#ifdef USE_CUDA
    /* take the CUDA path as soon as one of the tensors has a device id >= 0 */
    if(x->devID >=0 || y->devID >= 0){
        _CudaDropout(x, y, maskTensor, scaleFactor);

        DelTensorBuf(maskTensor);
        delete[] maskArray;
        return;
    }
#endif

    /* y = (x * mask) * scaleFactor */
    XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
    _Multiply(x, maskTensor, inter);
    _ScaleAndShift(inter, y, scaleFactor, 0);

    /* buffers are released in the reverse order of their creation */
    DelTensorBuf(inter);
    DelTensorBuf(maskTensor);
    delete[] maskArray;
}

/*
dropout function (return a XTensor structure)
make a new tensor to keep the result and return it

During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.

This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".

Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
For p = 1 every element is dropped and the output is all zeros.
>> x - input tensor
>> prob - probability to set an element zero (must be in [0, 1])
<< return - the masked and rescaled tensor
*/
XTensor Dropout(const XTensor &x, DTYPE prob)
{
    /* same range check as in _Dropout */
    CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");

    XTensor y(&x);
    y.SetTMP();

    /* prob = 1 would make 1/(1-p) a division by zero (0 * inf = NaN in the
       output); every element is dropped then, so the scale is pinned to 0 */
    DTYPE scaleFactor = prob < (DTYPE)1.0 ?
                        (DTYPE)1.0 / ((DTYPE)1.0 - prob) :
                        (DTYPE)0.0;

    /* time() only advances once per second, so seeding with it alone gives
       every call made within the same second the same mask sequence. A
       per-call counter is mixed in to keep consecutive calls apart.
       (Not thread-safe, like rand()/srand() themselves.) */
    static unsigned int callCount = 0;
    srand((unsigned int)time(NULL) ^ (callCount++ * 2654435761u));

    /* sample the mask on the host: one bernoulli number per element */
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(prob);

    XTensor maskTensor(&x);
    maskTensor.SetData(maskArray, unitNum);

    /* y = (x * mask) * scaleFactor */
    XTensor inter;
    inter = Multiply(x, maskTensor);
    y = ScaleAndShift(inter, scaleFactor, 0);

    delete[] maskArray;

    ///* tensor connection */
    //XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
    //XLink::AddParamToHead(&y, prob);

    return y;
}

/* 
backward computation of dropout function

dE/dx = dE/dy * dy/dx

>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> prob - probability to set an element zero
*/
void _DropoutBackward(const XTensor * y, const XTensor * x, 
                      const XTensor * dedy, XTensor * dedx, 
                      unsigned int seed, DTYPE prob)
{
    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
    {
        int unitNum = y->unitNum;
        DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);

        /* generate a mask tensor again with special probability */
        srand(seed);
        DTYPE * maskArray = new DTYPE[unitNum];
        for (int i = 0; i < unitNum; i++)
            maskArray[i] = RandomBernoulli(prob);

        XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
        maskTensor->SetData(maskArray, unitNum);

        #ifdef USE_CUDA
            if(x->devID >= 0 || y->devID >= 0){
                _CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
                
                DelTensorBuf(maskTensor);
                delete[] maskArray;
                return;
            }
        #endif

        DTYPE * dedyp = (DTYPE*)dedy->data;
        DTYPE * dedxp = (DTYPE*)dedx->data;

        /* dE/dx = dE/dy * dy/dx */
        for(int i = 0; i < unitNum; i++)
            dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;

        DelTensorBuf(maskTensor);
        delete[] maskArray;
    }
    else
        ShowNTErrors("TODO!");
}

} // namespace nts(NiuTrans.Tensor)