xuchen / Fairseq-S2T

Commit 33c67f90 authored Feb 22, 2022 by xuchen
fix the bugs
parent 1f8d6f6c
Showing 6 changed files with 30 additions and 12 deletions
fairseq/criterions/ctc.py                       +3  -3
fairseq/models/speech_to_text/s2t_ctc.py        +2  -3
fairseq/models/speech_to_text/s2t_sate.py       +0  -1
fairseq/models/transformer.py                   +17 -1
fairseq/modules/dlcl.py                         +4  -4
fairseq/modules/speech_to_text/subsampling.py   +4  -0
fairseq/criterions/ctc.py
@@ -176,9 +176,9 @@ class CtcCriterion(FairseqCriterion):
             from torch.distributions import Categorical
             # ctc_logit = ctc_logit.sort(dim=-1, descending=True)[0][:, :, 0:100]
             # ctc_logit = ctc_logit / ctc_logit.sum(dim=-1, keepdim=True)
-            cut_ctc_logit = ctc_logit.sort(dim=-1, descending=True)[0][:, :, 0:100]
-            ctc_entropy = Categorical(logits=cut_ctc_logit).entropy().sum()
-            # ctc_entropy = Categorical(logits=ctc_logit).entropy().sum()
+            # cut_ctc_logit = ctc_logit.sort(dim=-1, descending=True)[0][:, :, 0:100]
+            # ctc_entropy = Categorical(logits=cut_ctc_logit).entropy().sum()
+            ctc_entropy = Categorical(logits=ctc_logit).entropy().sum()
             logging_output["ctc_entropy"] = utils.item(ctc_entropy.data)
             logging_output["ctc_loss"] = utils.item(ctc_loss.data)
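Note: the fix drops the earlier top-100 truncation and logs the entropy of the full per-frame distribution instead. A minimal standalone sketch of what Categorical(logits=...).entropy() computes over CTC logits; the (time, batch, vocab) shape is an assumption for illustration, not taken from the repo:

    import torch
    from torch.distributions import Categorical

    # Hypothetical CTC logits: (time, batch, vocab). Categorical normalizes
    # the logits with a softmax over the last dimension internally.
    ctc_logit = torch.randn(50, 4, 1000)

    # Per-frame entropy of the output distribution, summed over all frames
    # and sentences -- the quantity stored in logging_output["ctc_entropy"].
    ctc_entropy = Categorical(logits=ctc_logit).entropy().sum()
    print(ctc_entropy.item())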
fairseq/models/speech_to_text/s2t_ctc.py
@@ -19,7 +19,6 @@ from fairseq.modules import (
     FairseqDropout,
     LayerNorm,
     PositionalEmbedding,
-    PositionalEncoding,
     LegacyRelPositionalEncoding,
     RelPositionalEncoding,
     S2TTransformerEncoderLayer,
@@ -464,7 +463,7 @@ class S2TCTCEncoder(FairseqEncoder):
             self.embed_positions = RelPositionalEncoding(
                 args.max_source_positions, args.encoder_embed_dim
             )
-        elif self.attn_type == "rel_selfattn":
+        elif self.attn_type in ["rel_selfattn", "rel_pos_legacy"]:
             self.embed_positions = LegacyRelPositionalEncoding(
                 args.encoder_embed_dim, args.dropout, args.max_source_positions
             )
@@ -560,7 +559,7 @@ class S2TCTCEncoder(FairseqEncoder):
         # padding and position embedding
         encoder_padding_mask = lengths_to_padding_mask(input_lengths)
-        if self.attn_type == "rel_pos" or self.attn_type == "rel_selfattn":
+        if self.attn_type in ["rel_selfattn", "rel_pos", "rel_pos_legacy"]:
             positions = self.embed_positions(x)
         elif self.attn_type == "rope":
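Note: the new rel_pos_legacy attention type shares LegacyRelPositionalEncoding with rel_selfattn, while rel_pos keeps the newer RelPositionalEncoding. As background, a generic Transformer-XL-style sketch of what a relative positional encoding table looks like; this is an illustration only, and the repo's modules may differ in signature and caching:

    import torch

    def rel_pos_table(seq_len: int, dim: int) -> torch.Tensor:
        # Sinusoidal embeddings for relative offsets (seq_len-1) .. -(seq_len-1),
        # giving a (2*seq_len - 1, dim) table indexed by relative distance.
        offsets = torch.arange(seq_len - 1, -seq_len, -1.0).unsqueeze(1)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        angles = offsets * inv_freq
        return torch.cat([angles.sin(), angles.cos()], dim=-1)

    print(rel_pos_table(seq_len=5, dim=8).shape)  # torch.Size([9, 8])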
fairseq/models/speech_to_text/s2t_sate.py
@@ -333,7 +333,6 @@ class S2TSATEEncoder(FairseqEncoder):
         if "ctc_logit" in acoustic_encoder_out and len(acoustic_encoder_out["ctc_logit"]) > 0:
             ctc_logit = acoustic_encoder_out["ctc_logit"][0]
             ctc_prob = F.softmax(ctc_logit / self.temperature, dim=-1)
-            # ctc_prob = self.acoustic_encoder.ctc.softmax(encoder_out, self.temperature)
         else:
             ctc_logit = None
             ctc_prob = None
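Note: the surviving line converts the acoustic encoder's CTC logits into a temperature-scaled distribution. A small sketch of the temperature's effect; the shape and the value 2.0 are assumptions for illustration:

    import torch
    import torch.nn.functional as F

    ctc_logit = torch.randn(50, 4, 1000)   # hypothetical (time, batch, vocab)
    temperature = 2.0                      # >1 flattens, <1 sharpens

    ctc_prob = F.softmax(ctc_logit / temperature, dim=-1)
    print(ctc_prob.sum(dim=-1)[0, 0])      # each frame still sums to 1.0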
fairseq/models/transformer.py
@@ -25,6 +25,8 @@ from fairseq.modules import (
     LayerNorm,
     PositionalEmbedding,
     SinusoidalPositionalEmbedding,
+    RelPositionalEncoding,
+    LegacyRelPositionalEncoding,
     TransformerDecoderLayer,
     TransformerEncoderLayer,
     DynamicLinearCombination
@@ -173,7 +175,8 @@ class TransformerModel(FairseqEncoderDecoderModel):
                             help='checkpoint activations at each layer, which saves GPU '
                                  'memory usage at the cost of some additional compute')
         parser.add_argument('--offload-activations', action='store_true',
-                            help='checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations.')
+                            help='checkpoint activations at each layer, then save to gpu. '
+                                 'Sets --checkpoint-activations.')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
                             help='do not perform cross-attention')
@@ -204,6 +207,8 @@ class TransformerModel(FairseqEncoderDecoderModel):
                 "selfattn",
                 "rel_selfattn",
                 "relative",
+                "rel_pos",
+                "rel_pos_legacy",
             ],
             help="transformer encoder self-attention layer type")
@@ -473,6 +478,17 @@ class TransformerEncoder(FairseqEncoder):
         self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

+        if self.attn_type == "rel_pos":
+            self.embed_positions = RelPositionalEncoding(
+                args.max_source_positions, args.encoder_embed_dim
+            )
+        elif self.attn_type in ["rel_selfattn", "rel_pos_legacy"]:
+            self.embed_positions = LegacyRelPositionalEncoding(
+                args.encoder_embed_dim, args.dropout, args.max_source_positions
+            )
+        elif self.attn_type == "rope":
+            self.embed_positions = None
+        else:  # Use absolute positional embedding
             self.embed_positions = (
                 PositionalEmbedding(
                     args.max_source_positions,
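Note: this change registers the two new attention types as legal parser values and reflows the --offload-activations help string (adjacent Python string literals concatenate at compile time, so the help text itself is unchanged). A minimal argparse sketch of the extended choices list; the flag name --encoder-attention-type is an assumption, since the hunk shows only the choices and help text:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--encoder-attention-type",          # assumed flag name
        default="selfattn",
        choices=["selfattn", "rel_selfattn", "relative",
                 "rel_pos", "rel_pos_legacy"],
        help="transformer encoder self-attention layer type",
    )
    args = parser.parse_args(["--encoder-attention-type", "rel_pos_legacy"])
    print(args.encoder_attention_type)       # rel_pos_legacy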
fairseq/modules/dlcl.py
@@ -3,6 +3,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np

+from fairseq.modules.layer_norm import LayerNorm
+

 class DynamicLinearCombination(nn.Module):
     """Implementation of Dynamic Linear Combination of Layers (DLCL)
@@ -34,9 +36,9 @@ class DynamicLinearCombination(nn.Module):
         # init triangular layer norm
         if args.normalize_embed:
-            self.layer_norms = nn.ModuleList([nn.LayerNorm(self.dim) for _ in range(layer_num)])
+            self.layer_norms = nn.ModuleList([LayerNorm(self.dim) for _ in range(layer_num)])
         else:
-            self.layer_norms = nn.ModuleList([nn.Sequential()] + [nn.LayerNorm(self.dim) for _ in range(layer_num - 1)])
+            self.layer_norms = nn.ModuleList([nn.Sequential()] + [LayerNorm(self.dim) for _ in range(layer_num - 1)])

         # states
         self.count = 0
@@ -165,5 +167,3 @@ class DynamicLinearCombination(nn.Module):
-    def forward(self):
-        pass
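Note on the motivation for swapping nn.LayerNorm for the LayerNorm imported from fairseq.modules.layer_norm: fairseq's helper returns apex's fused kernel when apex is installed and falls back to torch.nn.LayerNorm otherwise. A simplified sketch of that factory (the real helper also considers CUDA availability and an export flag):

    import torch.nn as nn

    def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
        # Prefer apex's fused kernel when available; otherwise fall back
        # to the standard PyTorch implementation.
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            return nn.LayerNorm(normalized_shape, eps, elementwise_affine)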
fairseq/modules/speech_to_text/subsampling.py
@@ -142,6 +142,9 @@ class Conv1dSubsampling(nn.Module):
                 get_activation_class(act, dim=1))
             for layer_id in range(num_layers)])
         out_dim = filters[-1]
+
+        self.linear = nn.Linear(filters, filters)
+
     def forward(self, x, x_len):
         # (B, T, D) -> (B, D, T)
@@ -154,6 +157,7 @@ class Conv1dSubsampling(nn.Module):
         if x_len is not None:
             x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1

         x = x.transpose(1, 2)
+        x = self.linear(x)
         return x, x_len
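Note: the length update is the standard Conv1d output-length formula specialized to kernel size 3, stride 2, padding 1 (the usual subsampling configuration; the exact kernel and padding are assumptions here, as they are not visible in the hunk): out_len = floor((in_len - 1) / 2) + 1. A quick check:

    import torch

    x_len = torch.tensor([100, 75, 43])
    out_len = torch.div(x_len - 1, 2, rounding_mode="floor") + 1
    print(out_len)  # tensor([50, 38, 22])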