Commit ca4271f2 by xuchen

use inf instead of 1e8 or 1e-8

parent 976237ec
@@ -64,9 +64,9 @@ class ESPNETMultiHeadedAttention(nn.Module):
         if kwargs.get("cal_localness", False) and not self.encoder_decoder_attention:
             self.cal_localness = True
             self.localness_window = kwargs.get("localness_window", 0.1)
-        if kwargs.get("cal_entropy", False):
+        if kwargs.get("cal_entropy", False): # and self.encoder_decoder_attention:
             self.cal_entropy = True
-        if kwargs.get("cal_topk_cross_attn_weights", False) and self.encoder_decoder_attention:
+        if kwargs.get("cal_topk_cross_attn_weights", False):
             self.cal_topk = True
             self.weights_topk = kwargs.get("topk_cross_attn_weights", 1)
         if kwargs.get("cal_monotonic_cross_attn_weights", False) and self.encoder_decoder_attention:
@@ -74,7 +74,7 @@ class ESPNETMultiHeadedAttention(nn.Module):
     def dump(self, fstream, info):
         if self.cal_localness:
-            print("%s window size: %f localness: %.2f" % (info, self.localness_window, self.localness), file=fstream)
+            print("%s window size: %.2f localness: %.4f" % (info, self.localness_window, self.localness), file=fstream)
         if self.cal_entropy:
             print("%s Entropy: %.2f" % (info, self.entropy), file=fstream)
@@ -119,8 +119,8 @@ class ESPNETMultiHeadedAttention(nn.Module):
         if mask is not None:
             scores = scores.masked_fill(
                 mask.unsqueeze(1).unsqueeze(2).to(bool),
-                -1e8 if scores.dtype == torch.float32 else -1e4
-                # float("-inf"), # (batch, head, time1, time2)
+                # -1e8 if scores.dtype == torch.float32 else -1e4
+                float("-inf"), # (batch, head, time1, time2)
             )
         # self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
...
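The hunk above switches the score mask from a dtype-dependent finite constant to float("-inf"). A minimal sketch of the same pattern (not part of the commit; shapes and values are illustrative) showing the effect, and why the old constant needed a dtype switch:

import torch

scores = torch.randn(1, 1, 2, 4)                   # (batch, head, time1, time2)
mask = torch.tensor([[False, False, True, True]])  # True marks padded key positions

scores = scores.masked_fill(
    mask.unsqueeze(1).unsqueeze(2).to(bool),       # broadcast to (batch, 1, 1, time2)
    float("-inf"),
)
attn = torch.softmax(scores, dim=-1)
print(attn[0, 0])  # padded columns receive exactly zero weight

# float("-inf") is representable in every dtype; the old constant -1e8
# overflows float16, which is why the code fell back to -1e4 there:
print(torch.tensor(-1e8, dtype=torch.float16))     # tensor(-inf, dtype=torch.float16)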
@@ -286,7 +286,7 @@ class PDSTransformerEncoderLayer(nn.Module):
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
-            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -float('inf'))
         # whether to use macaron style
         if self.macaron_norm is not None:
...
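The comment carried through this and the following hunks records the known trade-off of -inf masking: if every key position in a query row is padded, the whole score row becomes -inf and softmax over it yields NaN, which then propagates into gradients and parameters. A quick sketch of both behaviors (illustrative, not from the commit):

import torch

# Fully masked row: softmax over all -inf is NaN.
print(torch.softmax(torch.full((4,), float("-inf")), dim=-1))  # tensor([nan, nan, nan, nan])

# A large finite constant degenerates the same row to a uniform
# distribution instead: numerically safe, though the row is meaningless.
print(torch.softmax(torch.full((4,), -1e4), dim=-1))           # tensor([0.25, 0.25, 0.25, 0.25])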
@@ -252,7 +252,7 @@ class S2TTransformerEncoderLayer(nn.Module):
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
-            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -float('inf'))
         # whether to use macaron style
         if self.macaron_norm is not None:
...
@@ -315,7 +315,7 @@ class S2TTransformerS2EncoderLayer(nn.Module):
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
-            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -float('inf'))
         # whether to use macaron style
         if self.macaron_norm is not None:
...
@@ -188,7 +188,7 @@ class TransformerEncoderLayer(nn.Module):
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
             attn_mask = attn_mask.masked_fill(
-                attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4
+                attn_mask.to(torch.bool), -float('inf') # -1e8 if x.dtype == torch.float32 else -1e4
             )
         residual = x
...
@@ -241,7 +241,7 @@ class TransformerS2EncoderLayer(nn.Module):
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
             attn_mask = attn_mask.masked_fill(
-                attn_mask.to(torch.bool), -float('inf') # -1e8 if x.dtype == torch.float32 else -1e4
+                attn_mask.to(torch.bool), -float('inf') # -1e8 if x.dtype == torch.float32 else -1e4
             )
         residual = x
...
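In these encoder layers the mask is additive: the float attn_mask is added to the raw attention scores, so filling its True positions with -float('inf') zeroes them out after softmax. A self-contained sketch of that usage (assumed shapes, not from the commit):

import torch

attn_mask = torch.tensor([[0.0, 0.0, 1.0],
                          [0.0, 0.0, 1.0],
                          [0.0, 0.0, 1.0]])  # nonzero = disallowed position
attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -float("inf"))

scores = torch.randn(3, 3) + attn_mask       # additive mask on raw scores
attn = torch.softmax(scores, dim=-1)
print(attn[:, -1])                           # masked column is exactly zero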