arch: pdss2t_transformer_s_8
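# PDS (Progressive Down-Sampling) S2T Transformer, small configuration.
# The `_8` suffix is the total temporal down-sampling factor, i.e. the
# product of the per-stage pds-ratios below: 2 * 2 * 1 * 2 = 8.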

#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 0.1
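# If uncommented, the three options above add intermediate CTC losses
# inside the encoder: pds-ctc flags the stages that get a CTC head
# (here stages 2 and 3), intermedia-adapter selects how the CTC
# prediction is fed back into the encoder states, and
# intermedia-ctc-weight scales the extra loss. Left disabled here.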

#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_2_1
#attention-reduced-method: pool
#attention-reduced-q: True
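# Optional efficient-attention settings, also left disabled: the
# `reduced` attention type pools the attention inputs by the per-stage
# factors in pds-attn-ds-ratios (pooling the queries too when
# attention-reduced-q is True).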

encoder-embed-dim: 512
pds-stages: 4
#ctc-layer: 15
encoder-layers: 18
pds-layers: 6_3_3_6
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_384_384_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_4_4_4
pds-attn-heads: 4_6_6_8
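# Per-stage view of the parallel pds-* lists above:
#   stage   layers   ds-ratio   embed-dim   ffn-ratio   heads
#   1       6        2          256         8           4
#   2       3        2          384         4           6
#   3       3        1          384         4           6
#   4       6        2          512         4           8
# The stage layer counts sum to encoder-layers (6 + 3 + 3 + 6 = 18), the
# last embed-dim matches encoder-embed-dim (512), and the head dim is 64
# in every stage (256/4 = 384/6 = 512/8 = 64). pds-fusion combines the
# stage outputs, here with the convolutional all_conv method.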

share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
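# Inverse-sqrt schedule: the learning rate warms up linearly from
# warmup-init-lr (1e-7) to the peak lr (2e-3) over the first 10000
# updates, then decays in proportion to 1/sqrt(num_updates).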

criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
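# Joint objective: label-smoothed cross-entropy on the decoder output
# combined with a CTC loss on the encoder output (the CTC weight is
# usually set separately in the training recipe, e.g. via ctc-weight).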

dropout: 0.1
activation-fn: relu

decoder-layers: 6
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
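# Standard Transformer decoder matching the final encoder width:
# FFN dim 2048 = 4 * 512, head dim 512 / 8 = 64.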

#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
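# Uncomment these and point them at checkpoint paths to initialize the
# encoder and/or decoder from pre-trained models (e.g. an encoder from
# an ASR pre-training run).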