Commit 03076942 by xuchen

fix the bugs

parent 1d60b3a6
@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
+post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
...
@@ -31,6 +31,8 @@ decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
+cnn-module-norm: layer_norm
load-pretrained-encoder-from: /home/xuchen/after.pt
load-pretrained-decoder-from: /home/xuchen/after.pt
#load-pretrained-decoder-from:
ctc-weight: 0.3
-post-process: sentencepiece
+share-ctc-and-embed: True
\ No newline at end of file
ctc-weight: 0.2
interleaved-ctc-weight: 0.1
interleaved-ctc-layers: 6,9
interleaved-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0
sae-adapter: league
sae-drop-prob: 0.2
sae-distribution-cutoff: 10
share-ctc-and-sae: False
ctc-self-distill-weight: 0
inter_mixup: True
inter_mixup_layer: -1
inter_mixup_prob: 1.0
inter_mixup_ratio: 0.2
\ No newline at end of file
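The inter_mixup_* options above enable inter-sequence mixup inside the encoder; the Python hunks further down store a mixup dict with "coef", "index1", "index2" and now also "ratio" (the criterion uses the ratio to rescale token counts). As a rough standalone sketch of the general idea only (the Beta-sampled coefficient and random pairing are assumptions; the exact meaning of inter_mixup_ratio and inter_mixup_layer is defined by the model code, not here):

import torch

def inter_mixup_sketch(x, beta=0.2, apply_prob=1.0):
    """Mix each sequence in the batch with a randomly paired one.

    x: encoder states of shape (T, B, C). Returns the mixed states and a dict
    mirroring the fields ("coef", "index1", "index2") kept by the encoder.
    """
    if torch.rand(1).item() > apply_prob:
        return x, None
    batch_size = x.size(1)
    coef = torch.distributions.Beta(beta, beta).sample().item()  # assumed sampling scheme
    idx1 = torch.arange(batch_size)
    idx2 = torch.randperm(batch_size)
    mixed = coef * x[:, idx1, :] + (1 - coef) * x[:, idx2, :]
    return mixed, {"coef": coef, "index1": idx1, "index2": idx2}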
@@ -2,9 +2,9 @@
# training the model
-gpu_num=8
+gpu_num=2
update_freq=1
-max_tokens=40000
+max_tokens=160000
extra_tag=
extra_parameter=
@@ -13,12 +13,12 @@ extra_parameter=
exp_tag=
-#config_list=(base ctc)
+config_list=(base ctc)
-config_list=(base ctc conformer)
+#config_list=(base ctc conformer)
-config_list=(big ctc conformer)
+#config_list=(big ctc conformer)
#config_list=(pds_base_16)
-config_list=(pds_base_16 conformer)
+#config_list=(pds_base_16 conformer)
# exp full name
exp_name=
...
-arch: transformer_ctc
+arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
@@ -8,7 +8,7 @@ warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
-criterion: label_smoothed_cross_entropy_with_ctc
+criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
...
arch: transformer_ctc
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.3
attention-dropout: 0.0
activation-dropout: 0.0
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 1024
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
@@ -6,6 +6,7 @@ max-update: 50000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
+post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
...
-#ctc-weight: 0.2
-intermedia-ctc-weight: 0.3
-intermedia-ctc-layers: 2,4
-#target-ctc-weight: 0.3
-#target-ctc-layer: 6
-#target-intermedia-ctc-weight: 0.1
-#target-intermedia-ctc-layers: 2,4
-intermedia-adapter: league
-#intermedia-drop-prob: 0.2
-#intermedia-temperature: 5
-post-process: sentencepiece
+#ctc-layer:
+ctc-weight: 0.2
+interleaved-ctc-weight: 0.1
+interleaved-ctc-layers: 6,9
+interleaved-ctc-temperature: 1.0
+interleaved-ctc-drop-prob: 0
+interleaved_ctc_upsampling_ratio: 2
+sae-adapter: league
+sae-drop-prob: 0.0
+#sae-distribution-cutoff: 10
+share-ctc-and-sae: False
+ctc-self-distill-weight: 0
\ No newline at end of file
-arch: s2t_sate
+arch: s2t_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
@@ -37,9 +37,9 @@ activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
-#inter_mixup: True
+inter_mixup: True
-#inter_mixup_layer: -1
+inter_mixup_layer: -1
-#inter_mixup_ratio: 0.2
+inter_mixup_ratio: 0.2
ctc-weight: 0.2
interleaved-ctc-weight: 0.1
@@ -48,8 +48,8 @@ interleaved-temperature: 2
#target-ctc-weight: 0.3
#target-ctc-layer: 6
-target-interleaved-ctc-weight: 0.1
+#target-interleaved-ctc-weight: 0.1
-target-interleaved-ctc-layers: 2,4
+#target-interleaved-ctc-layers: 2,4
sae-adapter: league
share-ctc-and-sae: False
...
@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
+post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
@@ -17,4 +18,3 @@ log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
-post-process: sentencepiece
\ No newline at end of file
ctc-weight: 0.3
-post-process: sentencepiece
+share-ctc-and-embed: True
\ No newline at end of file
arch: transformer_ctc
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
+post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
...
@@ -33,4 +33,19 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
+#ctc-layer:
+#ctc-weight: 0.2
+interleaved-ctc-weight: 0.3
+interleaved-ctc-layers: 6,9
+interleaved-ctc-temperature: 1.0
+interleaved-ctc-drop-prob: 0
+interleaved_ctc_upsampling_ratio: 2
+sae-adapter: league
+sae-drop-prob: 0.0
+#sae-distribution-cutoff: 10
+share-ctc-and-sae: True
+ctc-self-distill-weight: 0
\ No newline at end of file
+#ctc-layer:
#ctc-weight: 0.2
-intermedia-ctc-weight: 0.3
-intermedia-ctc-layers: 2,4
-#target-ctc-weight: 0.3
-#target-ctc-layer: 6
-#target-intermedia-ctc-weight: 0.1
-#target-intermedia-ctc-layers: 2,4
-intermedia-adapter: league
-#intermedia-drop-prob: 0.2
-#intermedia-temperature: 5
-post-process: sentencepiece
+interleaved-ctc-weight: 0.3
+interleaved-ctc-layers: 6,9
+interleaved-ctc-temperature: 1.0
+interleaved-ctc-drop-prob: 0
+interleaved_ctc_upsampling_ratio: 2
+sae-adapter: league
+sae-drop-prob: 0.0
+#sae-distribution-cutoff: 10
+share-ctc-and-sae: False
+ctc-self-distill-weight: 0
\ No newline at end of file
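For orientation, the interleaved-ctc-* and sae-* options above correspond to the encoder logic visible in the Python hunks further down: an intermediate layer produces CTC logits, their temperature-scaled softmax is passed through the SAE adapter together with the hidden states, and the adapted representation continues through the remaining layers. A minimal sketch of that flow under simplifying assumptions (the 'league' combination shown here as a plain additive soft-embedding is an assumption, not the repository's Adapter code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class InterleavedCTCBlockSketch(nn.Module):
    """Toy version of an interleaved CTC step followed by an SAE-style adapter."""

    def __init__(self, dim, vocab_size):
        super().__init__()
        self.ctc_projection = nn.Linear(dim, vocab_size)           # interleaved CTC logits
        self.embed_adapter = nn.Linear(dim, vocab_size, bias=False)
        # With share-ctc-and-sae: True the diff ties these two weights together:
        # self.embed_adapter.weight = self.ctc_projection.weight

    def forward(self, x, temperature=1.0):
        # x: (T, B, C) hidden states from an intermediate encoder layer
        logits = self.ctc_projection(x)
        prob = F.softmax(logits / temperature, dim=-1)
        soft_embed = prob @ self.embed_adapter.weight               # (T, B, C)
        return x + soft_embed, logits                               # adapted states + logits for the CTC loss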
arch: transformer_ctc
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_transformer_s
share-decoder-input-output-embed: True
+share-ctc-and-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
...
@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
+post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
@@ -16,5 +17,4 @@ no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
-post-process: sentencepiece
\ No newline at end of file
ctc-weight: 0.3
-post-process: sentencepiece
+share-ctc-and-embed: True
\ No newline at end of file
@@ -263,7 +263,7 @@ class CtcCriterion(FairseqCriterion):
target_interleaved_ctc_loss = 0
# calculate the target CTC loss
-if self.target_ctc_weight > 0 or self.target_interleaved_ctc_weight:
+if self.target_ctc_weight > 0 or self.target_interleaved_ctc_weight > 0:
target = sample["target"]
pad_mask = (target != self.pad_idx) & (target != self.eos_idx)
@@ -297,27 +297,28 @@ class CtcCriterion(FairseqCriterion):
target_interleaved_ctc_num = 0
if "target_interleaved_ctc_logits" in net_output:
target_interleaved_ctc_num = len(net_output["target_interleaved_ctc_logits"])
-for i in range(target_interleaved_ctc_num):
-out = net_output["target_interleaved_ctc_logits"][i]
-if type(out) == list:
-inter_ctc_logit = out[0]
-padding = ~out[1]
-tgt_input_lengths = padding.long().sum(-1)
-else:
-inter_ctc_logit = out
-tgt_input_lengths = input_lengths
-tgt_inter_lprobs = model.get_normalized_probs(
-[inter_ctc_logit], log_probs=True
-).contiguous()  # (T, B, C) from the encoder
-tgt_inter_lprobs.batch_first = False
-for flat, lengths, coef in zip(target_flat, target_length, loss_coef):
-target_interleaved_ctc_loss += self.get_loss(tgt_inter_lprobs, flat, tgt_input_lengths, lengths) * coef
+if target_interleaved_ctc_num != 0 and self.target_interleaved_ctc_weight > 0:
+for i in range(target_interleaved_ctc_num):
+out = net_output["target_interleaved_ctc_logits"][i]
+if type(out) == list:
+inter_ctc_logit = out[0]
+padding = ~out[1]
+tgt_input_lengths = padding.long().sum(-1)
+else:
+inter_ctc_logit = out
+tgt_input_lengths = input_lengths
+tgt_inter_lprobs = model.get_normalized_probs(
+[inter_ctc_logit], log_probs=True
+).contiguous()  # (T, B, C) from the encoder
+tgt_inter_lprobs.batch_first = False
+for flat, lengths, coef in zip(target_flat, target_length, loss_coef):
+target_interleaved_ctc_loss += self.get_loss(tgt_inter_lprobs, flat, tgt_input_lengths,
+lengths) * coef
target_interleaved_ctc_loss /= target_interleaved_ctc_num
logging_output["target_interleaved_ctc_loss"] = utils.item(target_interleaved_ctc_loss.data)
# calculate the self distillation CTC loss
ctc_self_distill_loss = 0
@@ -358,7 +359,7 @@ class CtcCriterion(FairseqCriterion):
logging_output["all_ctc_loss"] = utils.item(loss.data)
if torch.isnan(loss) or torch.isinf(loss) or utils.item(loss.data) < 0:
-logger.warning("Illegal loss %f!" % loss)
+# logger.warning("Illegal loss %f!" % loss)
if self.ctc_weight != 0:
logger.warning("CTC loss %f!" % ctc_loss)
if self.interleaved_ctc_weight != 0:
@@ -366,7 +367,7 @@ class CtcCriterion(FairseqCriterion):
if self.target_ctc_weight != 0:
logger.warning("Target CTC loss %f!" % target_ctc_loss)
-if not model.training and self.ctc_weight > 0:
+if not model.training and self.ctc_weight + self.interleaved_ctc_weight > 0:
import editdistance
with torch.no_grad():
...
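Two small robustness changes sit in the hunks above: the target-CTC guard now uses an explicit `> 0` comparison instead of relying on float truthiness, and the target interleaved CTC block only runs when such logits are actually present. The last hunk also widens the evaluation gate so CTC error rates are reported whenever either ctc_weight or interleaved_ctc_weight is non-zero. The truthiness point in isolation (a standalone illustration, not repository code):

# A weight of 0.0 is falsy, so the old `or self.target_interleaved_ctc_weight`
# and the new `or self.target_interleaved_ctc_weight > 0` pick the same branch;
# the explicit comparison just matches the surrounding `> 0` checks.
for weight in (0.0, 0.1, 0.3):
    assert bool(weight) == (weight > 0)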
@@ -55,12 +55,12 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T
)
class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
def __init__(
self,
task,
sentence_avg,
label_smoothing,
ignore_prefix_size=0,
report_accuracy=False,
):
super().__init__(task)
self.sentence_avg = sentence_avg
@@ -99,11 +99,11 @@ class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
target = model.get_targets(sample, net_output)
if self.ignore_prefix_size > 0:
if getattr(lprobs, "batch_first", False):
-lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
+lprobs = lprobs[:, self.ignore_prefix_size:, :].contiguous()
-target = target[:, self.ignore_prefix_size :].contiguous()
+target = target[:, self.ignore_prefix_size:].contiguous()
else:
-lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous()
+lprobs = lprobs[self.ignore_prefix_size:, :, :].contiguous()
-target = target[self.ignore_prefix_size :, :].contiguous()
+target = target[self.ignore_prefix_size:, :].contiguous()
if "mixup" in net_output[1] and net_output[1]["mixup"] is not None:
mixup = net_output[1]["mixup"]
idx1 = mixup["index1"]
...
@@ -69,9 +69,9 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
n_tokens = sample["ntokens"]
n_sentences = sample["target"].size(0)
if use_mixup:
-sample_size //= 2
+sample_size //= net_output[0].size(0) if self.sentence_avg else encoder_out["mixup"]["ratio"]
-n_tokens //= 2
+n_tokens //= encoder_out["mixup"]["ratio"]
-n_sentences //= 2
+n_sentences //= net_output[0].size(0)
logging_output = {
"trans_loss": utils.item(loss.data) if reduce else loss.data,
@@ -88,7 +88,8 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
if self.ctc_criterion.all_ctc_weight > 0:
ctc_loss, logging_output = self.ctc_criterion.compute_ctc_loss(model, sample, encoder_out, logging_output)
-loss = (1 - self.ctc_weight) * loss + ctc_loss
+# loss = (1 - self.ctc_weight) * loss + ctc_loss
+loss = loss + ctc_loss
# if hasattr(model.encoder, "get_loss"):
# encoder_loss = model.encoder.get_loss()
...
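A note on the loss combination change in the last hunk: the translation loss is no longer scaled down by `(1 - ctc_weight)` before the CTC term is added. For example, with ctc_weight = 0.3, a translation loss of 2.0 and a CTC loss of 1.2, the old total was 0.7 * 2.0 + 1.2 = 2.6, while the new total is 2.0 + 1.2 = 3.2.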
@@ -259,11 +259,11 @@ class TextEncoder(FairseqEncoder):
"drop_prob": getattr(args, "sae_drop_prob", 0),
}
-self.sae_adapter = Adapter(embed_dim, args.sae_adapter,
+self.sae = Adapter(embed_dim, args.sae_adapter,
len(dictionary),
strategy=strategy)
-if args.share_target_ctc_and_sae and hasattr(self.sae_adapter, "embed_adapter"):
+if args.share_target_ctc_and_sae and hasattr(self.sae, "embed_adapter"):
-self.ctc.ctc_projection.weight = self.sae_adapter.embed_adapter.weight
+self.sae.embed_adapter.weight = self.ctc.ctc_projection.weight
self.interleaved_ctc_drop_prob = args.interleaved_ctc_drop_prob
@@ -297,7 +297,7 @@ class TextEncoder(FairseqEncoder):
target_interleaved_ctc_logits.append(logit)
prob = utils.softmax(logit / self.interleaved_ctc_temperature, dim=-1)
-x, encoder_padding_mask = self.sae_adapter([x, prob], encoder_padding_mask)
+x, encoder_padding_mask = self.sae([x, prob], encoder_padding_mask)
if history is not None:
history.push(x)
@@ -376,8 +376,8 @@ class S2TSATEEncoder(FairseqEncoder):
encoder_out = acoustic_encoder_out["encoder_out"][0]
encoder_padding_mask = acoustic_encoder_out["encoder_padding_mask"][0]
ctc_padding_mask = encoder_padding_mask
-if "mixup" in encoder_out:
+if "mixup" in acoustic_encoder_out:
-mixup = encoder_out["mixup"]
+mixup = acoustic_encoder_out["mixup"]
else:
mixup = None
@@ -406,7 +406,8 @@ class S2TSATEEncoder(FairseqEncoder):
x, target_ctc_logit, target_interleaved_ctc_logits = self.text_encoder(x, encoder_padding_mask,
self.history)
else:
-x, target_ctc_logit, target_interleaved_ctc_logits = self.text_encoder(x, encoder_padding_mask, self.history)
+x, target_ctc_logit, target_interleaved_ctc_logits = self.text_encoder(x, encoder_padding_mask,
+self.history)
return {
"encoder_out": [x],  # T x B x C
...
@@ -657,12 +657,12 @@ class S2TTransformerEncoder(FairseqEncoder):
"drop_prob": getattr(args, "sae_drop_prob", 0),
}
-self.sae_adapter = Adapter(dim, args.sae_adapter,
+self.sae = Adapter(dim, args.sae_adapter,
len(task.source_dictionary),
strategy=strategy,
)
-if args.share_ctc_and_sae and hasattr(self.sae_adapter, "embed_adapter"):
+if args.share_ctc_and_sae and hasattr(self.sae, "embed_adapter"):
-self.ctc.ctc_projection.weight = self.sae_adapter.embed_adapter.weight
+self.sae.embed_adapter.weight = self.ctc.ctc_projection.weight
# mixup
self.mixup = getattr(args, "inter_mixup", False)
@@ -734,6 +734,7 @@ class S2TTransformerEncoder(FairseqEncoder):
input_lengths = (~encoder_padding_mask).sum(-1)
mixup = {
+"ratio": self.mixup_ratio,
"coef": coef,
"index1": idx1,
"index2": idx2,
@@ -766,12 +767,12 @@ class S2TTransformerEncoder(FairseqEncoder):
# down-sampling
x, input_lengths = self.subsample(x, input_lengths)
+encoder_padding_mask = lengths_to_padding_mask(input_lengths)
# embedding scaling
x = self.embed_scale * x
-# padding and position embedding
+# position embedding
-encoder_padding_mask = lengths_to_padding_mask(input_lengths)
if self.attn_type in ["rel_pos", "rel_pos_legacy", "rel_selfattn"]:
positions = self.embed_positions(x)
@@ -836,7 +837,7 @@ class S2TTransformerEncoder(FairseqEncoder):
max=1e8 if logit.dtype == torch.float32 else 1e4)
prob = utils.softmax(logit / self.interleaved_ctc_temperature, dim=-1)
-x, encoder_padding_mask = self.sae_adapter([x, prob], encoder_padding_mask)
+x, encoder_padding_mask = self.sae([x, prob], encoder_padding_mask)
# gather cosine similarity
if self.gather_cos_sim:
...
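The two encoder hunks above (TextEncoder with share_target_ctc_and_sae, S2TTransformerEncoder with share_ctc_and_sae) reverse the direction of the weight tying between the CTC projection and the SAE adapter's embedding: the adapter's weight is now assigned from the CTC projection instead of overwriting it. Either assignment leaves the two modules sharing a single Parameter, but the direction decides whose initialization (and any loaded pretrained value) survives. A minimal standalone check of that behaviour (hypothetical sizes, not repository code):

import torch
import torch.nn as nn

vocab, dim = 100, 16  # hypothetical sizes
ctc_projection = nn.Linear(dim, vocab)             # stands in for CTC.ctc_projection, which the diff now re-initializes
embed_adapter = nn.Linear(dim, vocab, bias=False)  # stands in for Adapter.embed_adapter

# New direction: the adapter reuses the CTC projection's Parameter object.
embed_adapter.weight = ctc_projection.weight
assert embed_adapter.weight is ctc_projection.weight

# Gradients from both uses accumulate in the single shared tensor.
x = torch.randn(4, dim)
(ctc_projection(x).sum() + embed_adapter(x).sum()).backward()
assert ctc_projection.weight.grad is embed_adapter.weight.grad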
@@ -58,7 +58,7 @@ class ConvolutionModule(nn.Module):
elif norm_type == "layer_norm":
self.norm = LayerNorm(expand_embed_dim)
else:
-assert False, "Unsupported normalization type in convolution module"
+assert False, "Unsupported normalization type %s in convolution module" % norm_type
self.activation = get_activation_class(activation_fn)
self.pointwise_conv2 = torch.nn.Conv1d(
expand_embed_dim,
...
@@ -77,6 +77,7 @@ class Adapter(nn.Module):
if self.adapter_type in ["context", "league", "gated_league", "gated_league2", "inter_league"]:
self.cal_context = True
self.embed_adapter = nn.Linear(dim, dictionary_size, bias=False)  # reverse for initialization
+nn.init.normal_(self.embed_adapter.weight, mean=0, std=dim ** -0.5)
if embed_tokens is not None:
self.embed_adapter.weight = embed_tokens.weight
...
@@ -20,7 +20,8 @@ class CTC(nn.Module):
self.embed_dim = embed_dim
self.ctc_projection = nn.Linear(embed_dim, dictionary_size)
-# nn.init.normal_(self.ctc_projection.weight, mean=0, std=embed_dim ** -0.5)
+nn.init.normal_(self.ctc_projection.weight, mean=0, std=embed_dim ** -0.5)
+nn.init.constant_(self.ctc_projection.bias, 0.0)
self.ctc_dropout_module = FairseqDropout(
p=dropout, module_name=self.__class__.__name__
...
@@ -198,7 +198,11 @@ class Conv2dSubsampling(nn.Module):
transpose=True if norm == "layer" else False),
get_activation_class(act, dim=1)
) for layer_id in range(num_layers)])
-self.linear = nn.Linear(filters[-1] * in_dim // 2 ** num_layers, filters[-1])
+dim = in_dim
+for _ in range(num_layers):
+dim = (dim - 1) // 2
+self.linear = nn.Linear(dim*filters[-1], filters[-1])
def forward(self, x, x_len):
@@ -211,11 +215,12 @@ class Conv2dSubsampling(nn.Module):
# Update Sequence Lengths
if x_len is not None:
-x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1
+x_len = torch.div(x_len - 1, 2, rounding_mode='floor')
# (B, C, D // S, T // S) -> (B, C * D // S, T // S)
batch_size, channels, subsampled_dim, subsampled_length = x.size()
-assert subsampled_length == max(x_len), "The lengths are mismatched."
+assert subsampled_length == max(x_len), \
+("The lengths are mismatched: %d and %d." % (subsampled_length, max(x_len)))
x = x.reshape(batch_size, channels * subsampled_dim, subsampled_length).permute(2, 0, 1)
x = self.linear(x)
...
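The constructor change above replaces the closed-form `in_dim // 2 ** num_layers` with a per-layer `(dim - 1) // 2`, i.e. the output size of a stride-2 convolution without padding (the kernel-3, no-padding layer configuration is assumed here), and the forward pass drops the `+ 1` from the length update so the sequence lengths follow the same rule. A quick worked check of why the old estimate was off (standalone arithmetic; the 80-dimensional input and two layers are just example values):

in_dim, num_layers = 80, 2   # example values

# Old estimate: plain halving per layer.
old_dim = in_dim // 2 ** num_layers        # 80 // 4 = 20

# New estimate: (dim - 1) // 2 per layer, as in the updated constructor.
new_dim = in_dim
for _ in range(num_layers):
    new_dim = (new_dim - 1) // 2           # 80 -> 39 -> 19

print(old_dim, new_dim)                    # 20 19 -> the old Linear expected the wrong input width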
@@ -197,11 +197,11 @@ class SequenceGenerator(nn.Module):
)
net_input = sample["net_input"]
-if "transcript" in sample:
+# if "transcript" in sample:
-text_src_tokens = sample["transcript"]["tokens"]
+# text_src_tokens = sample["transcript"]["tokens"]
-text_src_lengths = sample["transcript"]["lengths"]
+# text_src_lengths = sample["transcript"]["lengths"]
-net_input["text_src_tokens"] = text_src_tokens
+# net_input["text_src_tokens"] = text_src_tokens
-net_input["text_src_lengths"] = text_src_lengths
+# net_input["text_src_lengths"] = text_src_lengths
if "src_tokens" in net_input:
src_tokens = net_input["src_tokens"]
...