# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
RoBERTa: A Robustly Optimized BERT Pretraining Approach.
"""

import logging

import torch
import torch.nn as nn
import torch.nn.functional as F

from fairseq import utils
from fairseq.model_parallel.models.transformer import ModelParallelTransformerEncoder
from fairseq.models import register_model, register_model_architecture
from fairseq.models.roberta import (
    roberta_base_architecture,
    roberta_prenorm_architecture,
    RobertaEncoder,
    RobertaModel,
)
from fairseq.modules import LayerNorm


try:
    from fairseq.model_parallel.megatron.mpu import (
        copy_to_model_parallel_region,
        gather_from_model_parallel_region,
        ColumnParallelLinear,
        VocabParallelEmbedding,
    )

    has_megatron_submodule = True
except (ImportError, ModuleNotFoundError):
    has_megatron_submodule = False

logger = logging.getLogger(__name__)


@register_model("model_parallel_roberta")
class ModelParallelRobertaModel(RobertaModel):
    def __init__(self, args, encoder):
        super().__init__(args, encoder)

        self.classification_heads = nn.ModuleDict()

    @staticmethod
    def add_args(parser):
        RobertaModel.add_args(parser)
        parser.add_argument(
            "--no-final-layer-norm",
            action="store_true",
            help=(
                "don't add final layernorm (only applicable when "
                "--encoder-normalize-before=True)"
            ),
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present
        base_architecture(args)

        # pad the vocabulary so that it can be partitioned evenly across
        # model parallel ranks
        task.source_dictionary.pad_to_multiple_(args.model_parallel_size * 8)
        task.target_dictionary.pad_to_multiple_(args.model_parallel_size * 8)

        if not hasattr(args, "max_positions"):
            args.max_positions = args.tokens_per_sample

        if getattr(args, "untie_weights_roberta", False):
            raise NotImplementedError(
                "--untie-weights-roberta is not supported in model parallel mode"
            )

        encoder = ModelParallelRobertaEncoder(args, task.source_dictionary)
        return cls(args, encoder)

    def forward(
        self,
        src_tokens,
        features_only=False,
        return_all_hiddens=False,
        classification_head_name=None,
        **kwargs
    ):
        if classification_head_name is not None:
            features_only = True

        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)

        if classification_head_name is not None:
            x = self.classification_heads[classification_head_name](x)
        return x, extra

    def register_classification_head(
        self, name, num_classes=None, inner_dim=None, **kwargs
    ):
        """Register a classification head."""
        if name in self.classification_heads:
            prev_num_classes = self.classification_heads[name].out_proj.out_features
            prev_inner_dim = self.classification_heads[name].dense.out_features
            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
                logger.warning(
                    're-registering head "{}" with num_classes {} (prev: {}) '
                    "and inner_dim {} (prev: {})".format(
                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
                    )
                )
        self.classification_heads[name] = ModelParallelRobertaClassificationHead(
            self.args.encoder_embed_dim,
            inner_dim or self.args.encoder_embed_dim,
            num_classes,
            self.args.pooler_activation_fn,
            self.args.pooler_dropout,
        )


class ModelParallelRobertaLMHead(nn.Module):
    """Head for masked language modeling."""

    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
        super().__init__()
        self.dense = ColumnParallelLinear(embed_dim, embed_dim, gather_output=True)
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.layer_norm = LayerNorm(embed_dim)

        if weight is None:
            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
        self.weight = weight
        self.bias = nn.Parameter(torch.zeros(output_dim))

    def forward(self, features, masked_tokens=None, **kwargs):
        # Only project the unmasked tokens while training,
        # saves both memory and computation
        if masked_tokens is not None:
            features = features[masked_tokens, :]

        x = self.dense(features)
        x = self.activation_fn(x)
        x = self.layer_norm(x)

        # copy features into the model parallel region (identity in the
        # forward pass, gradient all-reduce in the backward pass)
        x = copy_to_model_parallel_region(x)
        # project back to size of vocabulary with bias
        x = F.linear(x, self.weight)
        # gather the partial outputs from all model parallel ranks
        x = gather_from_model_parallel_region(x).contiguous()
        x = x + self.bias
        return x


class ModelParallelRobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(
        self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout
    ):
        super().__init__()
        self.dense = ColumnParallelLinear(input_dim, inner_dim, gather_output=True)
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = self.activation_fn(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class ModelParallelRobertaEncoder(RobertaEncoder):
    """RoBERTa encoder."""

    def __init__(self, args, dictionary):
        super().__init__(args, dictionary)
        assert not self.args.untie_weights_roberta

    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
        return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx)

    def build_encoder(self, args, dictionary, embed_tokens):
        return ModelParallelTransformerEncoder(args, dictionary, embed_tokens)

    def build_lm_head(self, embed_dim, output_dim, activation_fn, weight):
        return ModelParallelRobertaLMHead(embed_dim, output_dim, activation_fn, weight)


@register_model_architecture("model_parallel_roberta", "model_parallel_roberta")
def base_architecture(args):
    args.no_final_layer_norm = getattr(args, "no_final_layer_norm", False)
    # model parallel RoBERTa defaults to "Pre-LN" formulation
    roberta_prenorm_architecture(args)


# earlier versions of model parallel RoBERTa removed the final layer norm
@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_v1")
def model_parallel_roberta_v1_architecture(args):
    args.no_final_layer_norm = getattr(args, "no_final_layer_norm", True)
    base_architecture(args)


@register_model_architecture(
    "model_parallel_roberta", "model_parallel_roberta_postnorm"
)
def model_parallel_roberta_postnorm_architecture(args):
    # the original BERT/RoBERTa uses the "Post-LN" formulation
    roberta_base_architecture(args)


@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_base")
def model_parallel_roberta_base_architecture(args):
    base_architecture(args)


@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_large")
def model_parallel_roberta_large_architecture(args):
    args.encoder_layers = getattr(args, "encoder_layers", 24)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
    base_architecture(args)
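

# Usage sketch (illustrative only, not part of the library API). The
# *_architecture functions above simply fill in argparse defaults, so the
# registered presets can be inspected without GPUs or the megatron submodule.
# The snippet below assumes fairseq is installed; the printed values for the
# "large" preset come from model_parallel_roberta_large_architecture:
#
#     from argparse import Namespace
#     from fairseq.model_parallel.models.roberta.model import (
#         model_parallel_roberta_large_architecture,
#     )
#
#     args = Namespace()
#     model_parallel_roberta_large_architecture(args)
#     print(args.encoder_layers, args.encoder_embed_dim)  # 24 1024
#
# Actual model parallel pretraining goes through the standard fairseq-train
# CLI (the flags shown are existing fairseq options; the dataset path and
# hyperparameter values are placeholders):
#
#     fairseq-train $DATA_DIR \
#         --task masked_lm --criterion masked_lm \
#         --arch model_parallel_roberta_large \
#         --model-parallel-size 2 \
#         --tokens-per-sample 512 --batch-size 16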