Commit d220c040 by xuchen

Big update!

I optimized the implementation of the speech-to-text tasks. As always, I updated the shell scripts and YAML configurations for easier training.
There may still be some bugs, so a follow-up update is coming!
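
For context, the updated train.sh scripts in this diff build their training configuration by joining several of the YAML files into one comma-separated string and handing it to ./run.sh. Below is a minimal sketch of that composition step; the config names are taken from the scripts in this commit, while the exact arguments that run.sh accepts are truncated in the diff and are therefore left out here.

    # Sketch of the config-composition step used by the updated train.sh scripts.
    # The names in config_list come from this commit; how ./run.sh consumes the
    # resulting string is elided, since those arguments are truncated in the diff.
    config_list=(pds_base_16 conformer rpr)
    train_config=$(echo ${config_list[*]} | sed 's/ /,/g')   # -> "pds_base_16,conformer,rpr"
    echo "train_config=${train_config}"                      # this string is then passed on to ./run.sh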
parent 99763132
...@@ -40,5 +40,3 @@ encoder-attention-heads: 4 ...@@ -40,5 +40,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
...@@ -29,7 +29,7 @@ label_smoothing: 0.1 ...@@ -29,7 +29,7 @@ label_smoothing: 0.1
conv-kernel-sizes: 5,5 conv-kernel-sizes: 5,5
conv-channels: 1024 conv-channels: 1024
dropout: 0.1 dropout: 0.15
activation-fn: relu activation-fn: relu
encoder-embed-dim: 512 encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -40,5 +40,3 @@ encoder-attention-heads: 8 ...@@ -40,5 +40,3 @@ encoder-attention-heads: 8
decoder-embed-dim: 512 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8 decoder-attention-heads: 8
\ No newline at end of file
attention-dropout: 0.1
activation-dropout: 0.1
arch: s2t_conformer_s
macaron-style: True macaron-style: True
use-cnn-module: True use-cnn-module: True
cnn-module-kernel: 31 cnn-module-kernel: 31
#arch: pdss2t_transformer_s
#arch: s2t_transformer_s
arch: s2t_sate
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
cl-dropout: True
cl-dropout-epoch: 50
train-subset: train-clean-100 train-subset: train-clean-100
valid-subset: dev-clean valid-subset: dev-clean
...@@ -5,7 +26,7 @@ max-epoch: 100 ...@@ -5,7 +26,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 10 patience: 20
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -14,7 +35,6 @@ report-accuracy: True ...@@ -14,7 +35,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -28,11 +48,9 @@ criterion: label_smoothed_cross_entropy_with_ctc ...@@ -28,11 +48,9 @@ criterion: label_smoothed_cross_entropy_with_ctc
ctc-weight: 0.3 ctc-weight: 0.3
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024 conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
......
train-subset: train_st train-subset: train-clean-100
valid-subset: dev_st valid-subset: dev-clean
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -15,6 +15,14 @@ report-accuracy: True ...@@ -15,6 +15,14 @@ report-accuracy: True
#load-pretrained-acoustic-encoder-from: #load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from: #load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_ctc_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_ctc_conformer_lr0.001/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_pyramid4_all256_3333_sr8_ctc/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1114_st_pyramid4_all256_ctc_fix/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1015_st_pyramid4_all256_conformer_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_pyramid4_all256_conformer_ctc/avg_10_checkpoint.pt
arch: s2t_sate arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
...@@ -24,33 +32,37 @@ lr-scheduler: inverse_sqrt ...@@ -24,33 +32,37 @@ lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7 warmup-init-lr: 1e-7
warmup-updates: 10000 warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6 text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True #macaron-style: True
use-cnn-module: True #use-cnn-module: True
cnn-module-kernel: 31 #cnn-module-kernel: 31
acoustic-encoder: transformer #acoustic-encoder: transformer
#acoustic-encoder: conformer
acoustic-encoder: pyramid
adapter: league adapter: league
#adapter: none
#adapter: context
#decoder-embed-dim: 256 encoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048 pyramid-stages: 4
#decoder-attention-heads: 4 #pyramid-dropout: 0
#attention-dropout: 0.1 pyramid-layers: 3_3_3_3
#activation-dropout: 0.1 pyramid-sr-ratios: 2_2_1_2
pyramid-embed-dims: 256_256_256_256
pyramid-fuse: True
pyramid-reduced-embed: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4
\ No newline at end of file
arch: pys2t_transformer_s arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 3
pyramid-layers: 3_6_3
pyramid-fuse-way: all_conv
pyramid-fuse: True
pyramid-sr-ratios: 2_2_2
pyramid-embed-dims: 256_256_256
pyramid-reduced-embed: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1
pyramid-kernel-sizes: 5_5_5
pyramid-ffn-ratios: 8_8_8
pyramid-heads: 4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500 train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
...@@ -20,7 +7,7 @@ max-epoch: 100 ...@@ -20,7 +7,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -41,7 +28,6 @@ lr: 2e-3 ...@@ -41,7 +28,6 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -52,5 +38,3 @@ encoder-attention-heads: 4 ...@@ -52,5 +38,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_s_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0 #pyramid-dropout: 0
pyramid-layers: 2_2_6_2 pyramid-layers: 2_2_6_2
pyramid-sr-ratios: 2_2_2_2 pyramid-ratios: 2_2_2_2
pyramid-fuse: True pyramid-fusion: True
pyramid-fuse-way: all_conv pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500 train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
...@@ -21,7 +22,7 @@ max-epoch: 100 ...@@ -21,7 +22,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -42,7 +43,6 @@ lr: 2e-3 ...@@ -42,7 +43,6 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -53,5 +53,3 @@ encoder-attention-heads: 4 ...@@ -53,5 +53,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
train-subset: train_st arch: pdss2t_transformer_s_32
valid-subset: dev_st
max-epoch: 50 encoder-embed-dim: 256
max-update: 100000 pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8 num-workers: 8
patience: 10 patience: 10
...@@ -12,11 +29,8 @@ seed: 1 ...@@ -12,11 +29,8 @@ seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,31 +40,16 @@ warmup-updates: 10000 ...@@ -26,31 +40,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: league
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3 pyramid-layers: 3_3_3_3
pyramid-sr-ratios: 2_2_1_2 pyramid-ratios: 2_2_1_2
pyramid-fuse: True pyramid-fusion: True
pyramid-fuse-way: all_conv pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500 train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
...@@ -20,7 +22,7 @@ max-epoch: 100 ...@@ -20,7 +22,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -41,7 +43,6 @@ lr: 2e-3 ...@@ -41,7 +43,6 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -52,5 +53,3 @@ encoder-attention-heads: 4 ...@@ -52,5 +53,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
train-subset: train_st arch: pdss2t_transformer_m_8
valid-subset: dev_st #arch: pdss2t_transformer_m_16
#arch: pdss2t_transformer_m_32
max-epoch: 50 train-subset: train-clean-100,train-clean-360,train-other-500
max-update: 100000 valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8 num-workers: 8
patience: 10 patience: 10
...@@ -12,11 +16,8 @@ seed: 1 ...@@ -12,11 +16,8 @@ seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,32 +27,16 @@ warmup-updates: 10000 ...@@ -26,32 +27,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 512
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: league
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_m_16
encoder-embed-dim: 512 encoder-embed-dim: 512
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2 pyramid-layers: 2_2_6_2
#pyramid-layers: 3_3_3_3 pyramid-ratios: 2_2_2_2
pyramid-sr-ratios: 2_2_2_2 pyramid-fusion: True
pyramid-fuse: True pyramid-fusion-method: all_conv
pyramid-fuse-way: all_conv
pyramid-embed-dims: 512_512_512_512 pyramid-embed-dims: 512_512_512_512
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 4_4_4_4
pyramid-heads: 8_8_8_8 pyramid-attn-heads: 8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500 train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
...@@ -21,7 +22,7 @@ max-epoch: 100 ...@@ -21,7 +22,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -42,16 +43,13 @@ lr: 2e-3 ...@@ -42,16 +43,13 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 8
decoder-embed-dim: 256 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 8
attention-dropout: 0.1
activation-dropout: 0.1
train-subset: train arch: pdss2t_transformer_m_32
valid-subset: valid
max-epoch: 50 encoder-embed-dim: 512
max-update: 100000 pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512_512
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 4_4_4_4_4
pyramid-attn-heads: 8_8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8 num-workers: 8
patience: 10 patience: 10
...@@ -10,40 +27,29 @@ no-progress-bar: True ...@@ -10,40 +27,29 @@ no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: dlcl_transformer
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7 warmup-init-lr: 1e-7
warmup-updates: 8000 warmup-updates: 10000
lr: 1e-3 lr: 2e-3
adam_betas: (0.9,0.997) #adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 6 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 8 encoder-attention-heads: 8
decoder-embed-dim: 512 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8 decoder-attention-heads: 8
use-enc-dlcl: True
use-dec-dlcl: True
\ No newline at end of file
train-subset: train arch: pdss2t_transformer_m_8
valid-subset: valid
max-epoch: 50 encoder-embed-dim: 512
max-update: 100000 pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 4_4_4_4
pyramid-attn-heads: 8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8 num-workers: 8
patience: 10 patience: 10
...@@ -10,42 +27,29 @@ no-progress-bar: True ...@@ -10,42 +27,29 @@ no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7 warmup-init-lr: 1e-7
warmup-updates: 8000 warmup-updates: 10000
lr: 1e-3 lr: 2e-3
adam_betas: (0.9,0.997) #adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 6 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 8 encoder-attention-heads: 8
decoder-embed-dim: 512 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8 decoder-attention-heads: 8
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
#train-subset: train-clean-100,train-clean-360,train-other-500 arch: pdss2t_transformer_sd_8
train-subset: train-clean-100 #arch: pdss2t_transformer_sd_16
#arch: pdss2t_transformer_sd_32
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
max-epoch: 100 max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 0 num-workers: 8
patience: 10 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
arch: s2t_transformer_s #load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -22,26 +27,16 @@ warmup-updates: 10000 ...@@ -22,26 +27,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 3 encoder-layers: 30
decoder-layers: 3 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
pyramid-layers: 3_3_8_4 #pyramid-dropout: 0
pyramid-sr-ratios: 2_2_2_2 pyramid-layers: 5_5_12_8
pyramid-fuse: True pyramid-ratios: 2_2_2_2
pyramid-fuse-way: all_conv pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500 train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
...@@ -20,7 +22,7 @@ max-epoch: 100 ...@@ -20,7 +22,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -41,16 +43,13 @@ lr: 2e-3 ...@@ -41,16 +43,13 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 30
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 5_5_7_7_6
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 30
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pys2t_transformer_s arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
pyramid-layers: 5_5_15_5 #pyramid-dropout: 0
pyramid-sr-ratios: 2_2_2_2 pyramid-layers: 7_7_7_9
pyramid-fuse: True pyramid-ratios: 2_2_1_2
pyramid-fuse-way: all_conv pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500 train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean valid-subset: dev-clean
...@@ -20,7 +22,7 @@ max-epoch: 100 ...@@ -20,7 +22,7 @@ max-epoch: 100
max-update: 300000 max-update: 300000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
...@@ -41,16 +43,13 @@ lr: 2e-3 ...@@ -41,16 +43,13 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 30
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
...@@ -6,23 +6,24 @@ gpu_num=8 ...@@ -6,23 +6,24 @@ gpu_num=8
update_freq=1 update_freq=1
max_tokens=100000 max_tokens=100000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
#exp_tag= #exp_tag=
#config_list=(base) #config_list=(base)
#config_list=(ctc) #config_list=(ctc)
#config_list=(ctc conformer rpr) #config_list=(ctc conformer rpr)
config_list=(base conformer rpr) config_list=(base conformer rpr)
#config_list=(pyramid4_all256) #config_list=(pds_base)
#config_list=(pyramid5_all256) #config_list=(pds_big)
#config_list=(pds_deep)
# exp full name # exp full name
exp_name= exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g') train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh cmd="./run.sh
......
...@@ -2,22 +2,22 @@ set -e ...@@ -2,22 +2,22 @@ set -e
eval=1 eval=1
lcrm=0
tokenizer=0
root_dir=~/st/Fairseq-S2T root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/test data_dir=~/st/data/test
vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de vocab_dir=~/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en src_lang=en
tgt_lang=de tgt_lang=de
splits=(2019) subsets=(2019)
source ~/tools/audio/bin/activate
splits=`echo ${splits[*]} | sed 's/ /,/g'`
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang} cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir} --data-root ${data_dir}
--output-root ${data_dir} --output-root ${data_dir}
...@@ -42,4 +42,3 @@ cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py ...@@ -42,4 +42,3 @@ cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
deactivate
arch: s2t_conformer_s
macaron-style: True macaron-style: True
use-cnn-module: True use-cnn-module: True
cnn-module-kernel: 31 cnn-module-kernel: 31
train-subset: train_st arch: pdss2t_transformer_s_8
valid-subset: dev_st
max-epoch: 50 train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -11,10 +13,10 @@ log-interval: 100 ...@@ -11,10 +13,10 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_conformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,26 +26,16 @@ warmup-updates: 10000 ...@@ -24,26 +26,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_s_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0 #pyramid-dropout: 0
pyramid-layers: 2_2_6_2 pyramid-layers: 2_2_6_2
pyramid-sr-ratios: 2_2_2_2 pyramid-ratios: 2_2_2_2
pyramid-fuse: True pyramid-fusion: True
pyramid-fuse-way: all_conv pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train_asr train-subset: train_asr
valid-subset: dev_asr valid-subset: dev_asr
...@@ -21,12 +22,13 @@ max-epoch: 100 ...@@ -21,12 +22,13 @@ max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
...@@ -42,7 +44,6 @@ lr: 2e-3 ...@@ -42,7 +44,6 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -53,5 +54,3 @@ encoder-attention-heads: 4 ...@@ -53,5 +54,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
train-subset: train_st arch: pdss2t_transformer_s_32
valid-subset: dev_st
max-epoch: 50 encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -11,12 +28,10 @@ log-interval: 100 ...@@ -11,12 +28,10 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,32 +41,16 @@ warmup-updates: 10000 ...@@ -26,32 +41,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: league
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0 #pyramid-dropout: 0
pyramid-layers: 3_3_3_3 pyramid-layers: 3_3_3_3
pyramid-sr-ratios: 2_2_1_2 pyramid-ratios: 2_2_1_2
pyramid-fuse: True pyramid-fusion: True
pyramid-fuse-way: all_conv pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-reduced-embed: conv pyramid-ds-method: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train_asr train-subset: train_asr
valid-subset: dev_asr valid-subset: dev_asr
...@@ -21,12 +22,13 @@ max-epoch: 100 ...@@ -21,12 +22,13 @@ max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
patience: 20 patience: 10
no-progress-bar: True no-progress-bar: True
log-interval: 100 log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
...@@ -42,7 +44,6 @@ lr: 2e-3 ...@@ -42,7 +44,6 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -53,5 +54,3 @@ encoder-attention-heads: 4 ...@@ -53,5 +54,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
...@@ -6,23 +6,24 @@ gpu_num=8 ...@@ -6,23 +6,24 @@ gpu_num=8
update_freq=1 update_freq=1
max_tokens=40000 max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag= exp_tag=
#config_list=(base) #config_list=(base)
#config_list=(ctc) #config_list=(ctc)
#config_list=(base conformer) #config_list=(base conformer)
#config_list=(pyramid4_base) #config_list=(pds_base_16)
config_list=(pyramid4_base conformer rpr) config_list=(pds_base_16 conformer rpr)
# exp full name # exp full name
exp_name= exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g') train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh cmd="./run.sh
......
set -e set -e
eval=1 eval=1
lcrm=0
root_dir=~/st/Fairseq-S2T root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test data_dir=/home/xuchen/st/data/wmt/test
......
arch: s2t_conformer_s
macaron-style: True macaron-style: True
use-cnn-module: True use-cnn-module: True
cnn-module-kernel: 31 cnn-module-kernel: 31
arch: pdss2t_transformer_s_8
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -14,7 +16,6 @@ report-accuracy: True ...@@ -14,7 +16,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,14 +25,11 @@ warmup-updates: 10000 ...@@ -24,14 +25,11 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
...@@ -40,5 +38,3 @@ encoder-attention-heads: 4 ...@@ -40,5 +38,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -14,7 +31,6 @@ report-accuracy: True ...@@ -14,7 +31,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,27 +40,16 @@ warmup-updates: 10000 ...@@ -24,27 +40,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -14,7 +31,6 @@ report-accuracy: True ...@@ -14,7 +31,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_conformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,31 +40,16 @@ warmup-updates: 10000 ...@@ -24,31 +40,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
arch: pys2t_transformer_s arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
#pyramid-dropout: 0
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3 pyramid-layers: 3_3_3_3
pyramid-sr-ratios: 2_2_1_2 pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-fuse: True pyramid-ds-method: conv
pyramid-reduced-embed: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
...@@ -26,10 +28,8 @@ log-interval: 100 ...@@ -26,10 +28,8 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1002_pyramid4_all256_3333_sr8/avg_10_checkpoint.pt #load-pretrained-encoder-from:
#load-pretrained-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1002_pyramid4_all256_3333_sr8/checkpoint_best.pt #load-pretrained-decoder-from:
load-pretrained-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_pyramid4_all256_3333_sr8_ctc/avg_10_checkpoint.pt
load-pretrained-decoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_1003_2349_train_s_baseline/avg_10_checkpoint.pt
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -43,7 +43,6 @@ lr: 2e-3 ...@@ -43,7 +43,6 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
...@@ -54,5 +53,3 @@ encoder-attention-heads: 4 ...@@ -54,5 +53,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: s2t_sate_s
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
...@@ -43,10 +43,11 @@ text-encoder-layers: 6 ...@@ -43,10 +43,11 @@ text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True #macaron-style: True
use-cnn-module: True #use-cnn-module: True
cnn-module-kernel: 31 #cnn-module-kernel: 31
#acoustic-encoder: pds
acoustic-encoder: transformer acoustic-encoder: transformer
adapter: league adapter: league
...@@ -54,18 +55,17 @@ encoder-embed-dim: 256 ...@@ -54,18 +55,17 @@ encoder-embed-dim: 256
pyramid-stages: 4 pyramid-stages: 4
#pyramid-dropout: 0 #pyramid-dropout: 0
pyramid-layers: 3_3_3_3 pyramid-layers: 3_3_3_3
pyramid-sr-ratios: 2_2_1_2 pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pyramid-embed-dims: 256_256_256_256
pyramid-fuse: True pyramid-ds-method: conv
pyramid-reduced-embed: conv
pyramid-embed-norm: True pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1 pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4 pyramid-attn-heads: 4_4_4_4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
...@@ -6,6 +6,11 @@ gpu_num=8 ...@@ -6,6 +6,11 @@ gpu_num=8
update_freq=1 update_freq=1
max_tokens=40000 max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag= exp_tag=
#config_list=(base) #config_list=(base)
...@@ -14,17 +19,12 @@ config_list=(ctc) ...@@ -14,17 +19,12 @@ config_list=(ctc)
#config_list=(ctc conformer rpr) #config_list=(ctc conformer rpr)
#config_list=(base sate) #config_list=(base sate)
#config_list=(pyramid4_base_sr8) #config_list=(pds_base)
#config_list=(pyramid4_base_sr8 conformer) #config_list=(pds_base conformer)
# exp full name # exp full name
exp_name= exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g') train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh cmd="./run.sh
......
set -e
eval=1
lcrm=0
tokenizer=0
root_dir=~/st/Fairseq-S2T
data_dir=~/st/data/test
vocab_dir=~/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=de
subsets=(2019)
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--cmvn-type utterance"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
train-subset: train_st train-subset: train_asr
valid-subset: dev_st valid-subset: dev_asr
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -24,7 +24,6 @@ warmup-updates: 10000 ...@@ -24,7 +24,6 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
......
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
train-subset: train_st arch: pdss2t_transformer_s_8
valid-subset: dev_st
max-epoch: 50 train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -11,10 +13,10 @@ log-interval: 100 ...@@ -11,10 +13,10 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_conformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,26 +26,16 @@ warmup-updates: 10000 ...@@ -24,26 +26,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
train-subset: train_st arch: pdss2t_transformer_s_16
valid-subset: dev_st
max-epoch: 50 encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -11,12 +28,10 @@ log-interval: 100 ...@@ -11,12 +28,10 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,32 +41,16 @@ warmup-updates: 10000 ...@@ -26,32 +41,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
acoustic-encoder: conformer
adapter: league
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
train-subset: train_st arch: pdss2t_transformer_s_32
valid-subset: dev_st
max-epoch: 50 encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -11,12 +28,10 @@ log-interval: 100 ...@@ -11,12 +28,10 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,32 +41,16 @@ warmup-updates: 10000 ...@@ -26,32 +41,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
acoustic-encoder: conformer
adapter: league
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
train-subset: train_st arch: pdss2t_transformer_s_8
valid-subset: dev_st
max-epoch: 50 encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -11,12 +28,10 @@ log-interval: 100 ...@@ -11,12 +28,10 @@ log-interval: 100
seed: 1 seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,37 +41,16 @@ warmup-updates: 10000 ...@@ -26,37 +41,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: league
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#max-encoder-relative-length: 100
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
gpu_num=1 gpu_num=1
data_dir= data_dir=
test_subset=(test-clean test-other) test_subset=(test)
exp_name= exp_name=
if [ "$#" -eq 1 ]; then if [ "$#" -eq 1 ]; then
...@@ -13,7 +13,7 @@ fi ...@@ -13,7 +13,7 @@ fi
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
max_tokens=10000 max_tokens=80000
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
cmd="./run.sh cmd="./run.sh
...@@ -31,9 +31,9 @@ cmd="./run.sh ...@@ -31,9 +31,9 @@ cmd="./run.sh
if [[ -n ${data_dir} ]]; then if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}" cmd="$cmd --data_dir ${data_dir}"
fi fi
if [[ -n ${test_subset} ]]; then if [[ ${#test_subset[@]} -ne 0 ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'` subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${test_subset}" cmd="$cmd --test_subset ${subsets}"
fi fi
echo $cmd echo $cmd
......
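For reference, a minimal sketch of what the updated block above does with a non-empty test_subset array; the subset names and the --stage value are placeholders, not values taken from this commit.

#!/bin/bash
# Illustration only: join the requested test subsets into the comma-separated
# value that decode.sh forwards to run.sh.
test_subset=(dev-clean test-clean test-other)   # placeholder subsets
cmd="./run.sh --stage 2"                        # placeholder stage

if [[ ${#test_subset[@]} -ne 0 ]]; then
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')   # "dev-clean,test-clean,test-other"
    cmd="$cmd --test_subset ${subsets}"
fi

echo $cmd   # ./run.sh --stage 2 --test_subset dev-clean,test-clean,test-other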
gpu_num=1 gpu_num=4
cmd="sh train.sh"
while : while :
do do
all_devices=$(seq 0 `gpustat | sed '1,2d' | wc -l`); record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0 count=0
for dev in ${all_devices[@]} for dev in ${all_devices[@]}
do do
line=`expr $dev + 2` line=$((dev + 2))
use=`gpustat -p | head -n $line | tail -1 | cut -d '|' -f4 | wc -w` use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -eq 0 ]]; then
if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
break break
fi fi
......
...@@ -5,17 +5,18 @@ get_devices(){ ...@@ -5,17 +5,18 @@ get_devices(){
device=() device=()
while : while :
do do
record=`mktemp -t temp.record.XXXXXX` record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record gpustat > $record
all_devices=$(seq 0 `cat $record | sed '1,2d' | wc -l`); all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0 count=0
for dev in ${all_devices[@]} for dev in ${all_devices[@]}
do do
line=`expr $dev + 2` line=$((dev + 2))
use=`cat $record | head -n $line | tail -1 | cut -d '|' -f3 | cut -d '/' -f1` use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
break break
fi fi
......
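A usage sketch for the helper above, assuming gpustat prints a two-line header followed by one line per GPU with used/total memory in the third '|'-separated field, and that local/utils.sh provides get_devices as shown.

#!/bin/bash
# Illustration only: reserve two idle GPUs (used memory below 100 MiB) and
# expose them to training, mirroring how run.sh calls the helper.
gpu_num=2
source ./local/utils.sh               # defines get_devices()
device=$(get_devices ${gpu_num} 0)
export CUDA_VISIBLE_DEVICES=${device}
echo "Using GPUs: ${CUDA_VISIBLE_DEVICES}"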
#! /bin/bash #! /bin/bash
# Processing LibriSpeech Datasets # Processing ASR Datasets
# Copyright 2021 Natural Language Processing Laboratory # Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com) # Xu Chen (xuchenneu@163.com)
...@@ -20,7 +20,7 @@ stop_stage=0 ...@@ -20,7 +20,7 @@ stop_stage=0
######## hardware ######## ######## hardware ########
# devices # devices
device=() #device=()
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
...@@ -31,40 +31,40 @@ pwd_dir=$PWD ...@@ -31,40 +31,40 @@ pwd_dir=$PWD
src_lang=en src_lang=en
lang=${src_lang} lang=${src_lang}
dataset= dataset=asr
task=speech_to_text task=speech_to_text
vocab_type=unigram vocab_type=unigram
vocab_size=10000 vocab_size=5000
speed_perturb=0 speed_perturb=0
lcrm=1 lcrm=0
tokenizer=0 tokenizer=0
use_specific_dict=0 use_specific_dict=0
specific_prefix=valid specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de specific_dir=/home/xuchen/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=/media/data/${dataset} org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset} data_dir=~/st/data/${dataset}/asr
train_split=train train_split=train
valid_split=valid valid_split=valid
test_split=test test_split=test
test_subset=dev-clean,dev-other,test-clean,test-other test_subset=test
# exp # exp
exp_prefix=${time} exp_prefix=$(date "+%m%d")
extra_tag= extra_tag=
extra_parameter= extra_parameter=
exp_tag=baseline exp_tag=baseline
exp_name= exp_name=
# config # config
train_config=train_ctc.yaml train_config=ctc
data_config=config.yaml data_config=config_asr.yaml
# training setting # training setting
fp16=1 fp16=1
max_tokens=20000 max_tokens=40000
step_valid=0 step_valid=0
# decoding setting # decoding setting
...@@ -77,17 +77,24 @@ if [[ ${speed_perturb} -eq 1 ]]; then ...@@ -77,17 +77,24 @@ if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp exp_prefix=${exp_prefix}_sp
fi fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix} data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix} exp_prefix=${exp_prefix}_${specific_prefix}
fi fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1; . ./local/parse_options.sh || exit 1;
# full path
train_config=$pwd_dir/conf/${train_config}
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag} config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
...@@ -102,12 +109,10 @@ fi ...@@ -102,12 +109,10 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to prepare the data yourself in the following part. ### Task dependent. You have to prepare the data yourself in the following part.
### But you can utilize Kaldi recipes in most cases ### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data Preparation" echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir} mkdir -p ${data_dir}
fi fi
source ~/tools/audio/bin/activate
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
...@@ -136,17 +141,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -136,17 +141,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="$cmd cmd="$cmd
--tokenizer" --tokenizer"
fi fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd [[ $eval -eq 1 ]] && eval ${cmd}
fi fi
data_dir=${data_dir}/${lang}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ASR Network Training" echo "stage 1: ASR Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1; [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then if [[ ${gpu_num} -eq 0 ]]; then
device=() device=""
else else
source ./local/utils.sh source ./local/utils.sh
device=$(get_devices $gpu_num 0) device=$(get_devices $gpu_num 0)
...@@ -163,12 +171,31 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -163,12 +171,31 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
cp ${train_config} ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit 1
fi
cp ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--train-config ${train_config}
--task ${task} --task ${task}
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test --skip-invalid-size-inputs-valid-test
...@@ -230,8 +257,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -230,8 +257,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# save info # save info
log=./history.log log=./history.log
echo "${time} | ${device} | ${data_dir} | ${model_dir} " >> $log echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
cat $log | tail -n 50 > tmp.log tail -n 50 ${log} > tmp.log
mv tmp.log $log mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device} export CUDA_VISIBLE_DEVICES=${device}
...@@ -239,7 +266,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -239,7 +266,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
eval $cmd eval $cmd
sleep 2s sleep 2s
tail -n `wc -l ${model_dir}/train.log | awk '{print $1+1}'` -f ${model_dir}/train.log tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi fi
fi fi
wait wait
...@@ -262,7 +289,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -262,7 +289,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then if [[ ${gpu_num} -eq 0 ]]; then
device=() device=""
else else
source ./local/utils.sh source ./local/utils.sh
device=$(get_devices $gpu_num 0) device=$(get_devices $gpu_num 0)
...@@ -270,14 +297,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -270,14 +297,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi fi
export CUDA_VISIBLE_DEVICES=${device} export CUDA_VISIBLE_DEVICES=${device}
#tmp_file=$(mktemp ${model_dir}/tmp-XXXXX)
#trap 'rm -rf ${tmp_file}' EXIT
result_file=${model_dir}/decode_result result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file} [[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ }) test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
subset=${subset} subset=${subset}_asr
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
...@@ -288,7 +313,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -288,7 +313,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--beam ${beam_size} --beam ${beam_size}
--lenpen ${len_penalty} --lenpen ${len_penalty}
--scoring wer" --scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
"
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
......
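To make the new multi-config plumbing concrete, the sketch below reproduces what the loop in run.sh builds for config_list=(pds_base_16 conformer rpr): the first YAML is passed as --train-config and each further one as --train-config1, --train-config2, and so on. Nothing here is taken from outside this commit; the conf/ paths are whatever the recipe directory contains.

#!/bin/bash
# Illustration only: expand a comma-separated config list into the
# --train-config / --train-configN arguments assembled by run.sh.
pwd_dir=$PWD
config_list=(pds_base_16 conformer rpr)
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')   # pds_base_16,conformer,rpr

extra_parameter=""
idx=0
for config in ${train_config//,/ }; do
    config_path=$pwd_dir/conf/${config}.yaml
    if [[ $idx -eq 0 ]]; then
        extra_parameter="${extra_parameter} --train-config ${config_path}"
    else
        extra_parameter="${extra_parameter} --train-config${idx} ${config_path}"
    fi
    idx=$((idx + 1))
done

echo ${extra_parameter}
# --train-config .../conf/pds_base_16.yaml --train-config1 .../conf/conformer.yaml --train-config2 .../conf/rpr.yaml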
...@@ -3,17 +3,28 @@ ...@@ -3,17 +3,28 @@
# training the model # training the model
gpu_num=8 gpu_num=8
update_freq=2 update_freq=1
max_tokens=20000 max_tokens=40000
extra_tag= extra_tag=
extra_parameter= extra_parameter=
#extra_tag="${extra_tag}" #extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} " #extra_parameter="${extra_parameter} "
exp_tag= exp_tag=
train_config=train_ctc.yaml
#config_list=(base)
#config_list=(ctc)
#config_list=(base conformer)
#config_list=(pds_base_16)
config_list=(pds_base_16 conformer rpr)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh cmd="./run.sh
--stage 1 --stage 1
...@@ -24,6 +35,9 @@ cmd="./run.sh ...@@ -24,6 +35,9 @@ cmd="./run.sh
--max_tokens ${max_tokens} --max_tokens ${max_tokens}
" "
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}" cmd="$cmd --exp_tag ${exp_tag}"
fi fi
...@@ -34,5 +48,5 @@ if [[ -n ${extra_parameter} ]]; then ...@@ -34,5 +48,5 @@ if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\"" cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi fi
echo $cmd echo ${cmd}
eval $cmd eval ${cmd}
use-enc-dlcl: True
use-dec-dlcl: True
#encoder-attention-type: rel_selfattn
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
\ No newline at end of file
...@@ -13,7 +13,7 @@ fi ...@@ -13,7 +13,7 @@ fi
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
max_tokens=10000 max_tokens=80000
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
cmd="./run.sh cmd="./run.sh
......
gpu_num=1 gpu_num=4
cmd="sh train.sh"
while : while :
do do
all_devices=$(seq 0 `gpustat | sed '1,2d' | wc -l`); record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0 count=0
for dev in ${all_devices[@]} for dev in ${all_devices[@]}
do do
line=`expr $dev + 2` line=$((dev + 2))
use=`gpustat -p | head -n $line | tail -1 | cut -d '|' -f4 | wc -w` use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -eq 0 ]]; then
if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
break break
fi fi
......
...@@ -5,17 +5,18 @@ get_devices(){ ...@@ -5,17 +5,18 @@ get_devices(){
device=() device=()
while : while :
do do
record=`mktemp -t temp.record.XXXXXX` record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record gpustat > $record
all_devices=$(seq 0 `cat $record | sed '1,2d' | wc -l`); all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0 count=0
for dev in ${all_devices[@]} for dev in ${all_devices[@]}
do do
line=`expr $dev + 2` line=$((dev + 2))
use=`cat $record | head -n $line | tail -1 | cut -d '|' -f3 | cut -d '/' -f1` use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
break break
fi fi
......
...@@ -20,7 +20,7 @@ stop_stage=0 ...@@ -20,7 +20,7 @@ stop_stage=0
######## hardware ######## ######## hardware ########
# devices # devices
#device=() device=()
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
...@@ -32,21 +32,21 @@ src_lang=en ...@@ -32,21 +32,21 @@ src_lang=en
tgt_lang=de tgt_lang=de
lang=${src_lang}-${tgt_lang} lang=${src_lang}-${tgt_lang}
dataset= dataset=mt
task=translation task=translation
vocab_type=unigram vocab_type=unigram
vocab_size=10000 vocab_size=10000
share_dict=1 share_dict=1
lcrm=1 lcrm=0
tokenizer=1 tokenizer=0
use_specific_dict=0 use_specific_dict=0
specific_prefix=wmt_share32k specific_prefix=st
specific_dir=/home/xuchen/st/data/wmt/mt_lcrm/en-de/unigram32000_share specific_dir=/home/xuchen/st/data/mustc/st/en-de/
src_vocab_prefix=spm_unigram32000_share src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram32000_share tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=/media/data/${dataset} org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang} data_dir=~/st/data/${dataset}/mt/${lang}
train_subset=train train_subset=train
valid_subset=dev valid_subset=dev
...@@ -61,7 +61,7 @@ exp_tag=baseline ...@@ -61,7 +61,7 @@ exp_tag=baseline
exp_name= exp_name=
# config # config
train_config=train.yaml train_config=base_s
# training setting # training setting
fp16=1 fp16=1
...@@ -104,9 +104,9 @@ fi ...@@ -104,9 +104,9 @@ fi
. ./local/parse_options.sh || exit 1; . ./local/parse_options.sh || exit 1;
# full path # full path
train_config=$pwd_dir/conf/${train_config}
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag} config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
...@@ -150,7 +150,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -150,7 +150,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{ {
cmd="cat ${org_data_dir}/${lang}/data/${split}.${src_lang}" cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}" cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
fi fi
...@@ -178,7 +178,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -178,7 +178,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--source-lang ${src_lang} --target-lang ${tgt_lang} --source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset} --trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset} --validpref ${data_dir}/data/${valid_subset}
--testpref ${data_dir}/data/${test_subset} --testpref ${data_dir}/data/${trans_subset}
--destdir ${data_dir}/data-bin --destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt --srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt --tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
...@@ -196,7 +196,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -196,7 +196,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then if [[ ${gpu_num} -eq 0 ]]; then
device=() device=""
else else
source ./local/utils.sh source ./local/utils.sh
device=$(get_devices $gpu_num 0) device=$(get_devices $gpu_num 0)
...@@ -213,13 +213,32 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -213,13 +213,32 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
cp ${train_config} ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit 1
fi
cp ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
--target-lang ${tgt_lang} --target-lang ${tgt_lang}
--train-config ${train_config}
--task ${task} --task ${task}
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test --skip-invalid-size-inputs-valid-test
...@@ -246,7 +265,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -246,7 +265,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
save_interval=1 save_interval=1
keep_last_epochs=10 keep_last_epochs=10
no_epoch_checkpoints=0 no_epoch_checkpoints=0
save_interval_updates=10000 save_interval_updates=500
keep_interval_updates=10 keep_interval_updates=10
else else
validate_interval=1 validate_interval=1
...@@ -290,8 +309,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -290,8 +309,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# save info # save info
log=./history.log log=./history.log
echo "${time} | ${device} | ${data_dir} | ${model_dir} " >> $log echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
cat $log | tail -n 50 > tmp.log tail -n 50 ${log} > tmp.log
mv tmp.log $log mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device} export CUDA_VISIBLE_DEVICES=${device}
...@@ -299,7 +318,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -299,7 +318,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
eval $cmd eval $cmd
sleep 2s sleep 2s
tail -n `wc -l ${model_dir}/train.log | awk '{print $1+1}'` -f ${model_dir}/train.log tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi fi
fi fi
wait wait
...@@ -322,7 +341,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -322,7 +341,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then if [[ ${gpu_num} -eq 0 ]]; then
device=() device=""
else else
source ./local/utils.sh source ./local/utils.sh
device=$(get_devices $gpu_num 0) device=$(get_devices $gpu_num 0)
...@@ -335,7 +354,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -335,7 +354,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ }) test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
......
...@@ -4,16 +4,20 @@ ...@@ -4,16 +4,20 @@
gpu_num=1 gpu_num=1
update_freq=1 update_freq=1
max_tokens=4096 max_tokens=8192
exp_tag=baseline
config_list=(base)
# exp full name
exp_name=
extra_tag= extra_tag=
extra_parameter= extra_parameter=
#extra_tag="${extra_tag}" #extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} " #extra_parameter="${extra_parameter} "
exp_tag=baseline train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
train_config=train.yaml
cmd="./run.sh cmd="./run.sh
--stage 1 --stage 1
...@@ -24,6 +28,9 @@ cmd="./run.sh ...@@ -24,6 +28,9 @@ cmd="./run.sh
--max_tokens ${max_tokens} --max_tokens ${max_tokens}
" "
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}" cmd="$cmd --exp_tag ${exp_tag}"
fi fi
...@@ -34,5 +41,5 @@ if [[ -n ${extra_parameter} ]]; then ...@@ -34,5 +41,5 @@ if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\"" cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi fi
echo $cmd echo ${cmd}
eval $cmd eval ${cmd}
...@@ -15,9 +15,7 @@ src_lang=en ...@@ -15,9 +15,7 @@ src_lang=en
tgt_lang=de tgt_lang=de
splits=(2019) splits=(2019)
source ~/tools/audio/bin/activate splits=$(echo ${splits[*]} | sed 's/ /_/g')
splits=`echo ${splits[*]} | sed 's/ /,/g'`
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang} cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang} cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
...@@ -48,4 +46,3 @@ cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py ...@@ -48,4 +46,3 @@ cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
deactivate
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -24,7 +24,6 @@ warmup-updates: 10000 ...@@ -24,7 +24,6 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
......
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
ctc-weight: 0.3
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
arch: pdss2t_transformer_s_8
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -14,7 +16,6 @@ report-accuracy: True ...@@ -14,7 +16,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,14 +25,11 @@ warmup-updates: 10000 ...@@ -24,14 +25,11 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
...@@ -40,5 +38,3 @@ encoder-attention-heads: 4 ...@@ -40,5 +38,3 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -14,7 +31,6 @@ report-accuracy: True ...@@ -14,7 +31,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,27 +40,16 @@ warmup-updates: 10000 ...@@ -24,27 +40,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
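The pds configs above and below follow a per-stage convention: pyramid-stages gives the stage count, and every underscore-separated field (layers, ratios, embed dims, kernel sizes, FFN ratios, attention heads, position embeds) is expected to carry one value per stage. A small sanity-check sketch under that assumption:

#!/bin/bash
# Illustration only: verify that each per-stage field has exactly
# pyramid-stages entries (4 here, matching pdss2t_transformer_s_16).
stages=4
for field in "2_2_6_2" "2_2_2_2" "256_256_256_256" "5_5_5_5" "8_8_8_8" "4_4_4_4"; do
    IFS='_' read -r -a values <<< "${field}"
    if [[ ${#values[@]} -ne ${stages} ]]; then
        echo "Field ${field} has ${#values[@]} entries, expected ${stages}" >&2
        exit 1
    fi
done
echo "All per-stage fields match pyramid-stages=${stages}."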
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -14,7 +31,6 @@ report-accuracy: True ...@@ -14,7 +31,6 @@ report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_conformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -24,31 +40,16 @@ warmup-updates: 10000 ...@@ -24,31 +40,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True decoder-embed-dim: 256
use-cnn-module: True decoder-ffn-embed-dim: 2048
cnn-module-kernel: 31 decoder-attention-heads: 4
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -12,11 +29,8 @@ seed: 1 ...@@ -12,11 +29,8 @@ seed: 1
report-accuracy: True report-accuracy: True
#load-pretrained-encoder-from: #load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -26,37 +40,16 @@ warmup-updates: 10000 ...@@ -26,37 +40,16 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
acoustic-encoder: conformer
adapter: league
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
train-subset: train_st train-subset: train_st
valid-subset: dev_st valid-subset: dev_st
max-epoch: 50 max-epoch: 100
max-update: 100000 max-update: 100000
num-workers: 8 num-workers: 8
...@@ -43,20 +43,29 @@ text-encoder-layers: 6 ...@@ -43,20 +43,29 @@ text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
macaron-style: True #macaron-style: True
use-cnn-module: True #use-cnn-module: True
cnn-module-kernel: 31 #cnn-module-kernel: 31
acoustic-encoder: conformer #acoustic-encoder: pds
acoustic-encoder: transformer
adapter: league adapter: league
encoder-attention-type: relative encoder-embed-dim: 256
decoder-attention-type: relative pyramid-stages: 4
max-encoder-relative-length: 100 #pyramid-dropout: 0
max-decoder-relative-length: 20 pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
...@@ -13,7 +13,7 @@ fi ...@@ -13,7 +13,7 @@ fi
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
max_tokens=10000 max_tokens=80000
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
cmd="./run.sh cmd="./run.sh
...@@ -31,9 +31,9 @@ cmd="./run.sh ...@@ -31,9 +31,9 @@ cmd="./run.sh
if [[ -n ${data_dir} ]]; then if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}" cmd="$cmd --data_dir ${data_dir}"
fi fi
if [[ -n ${test_subset} ]]; then if [[ ${#test_subset[@]} -ne 0 ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'` subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${test_subset}" cmd="$cmd --test_subset ${subsets}"
fi fi
echo $cmd echo $cmd
......
gpu_num=1 gpu_num=4
cmd="sh train.sh"
while : while :
do do
all_devices=$(seq 0 `gpustat | sed '1,2d' | wc -l`); record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0 count=0
for dev in ${all_devices[@]} for dev in ${all_devices[@]}
do do
line=`expr $dev + 2` line=$((dev + 2))
use=`gpustat -p | head -n $line | tail -1 | cut -d '|' -f4 | wc -w` use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -eq 0 ]]; then
if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
break break
fi fi
......
...@@ -5,17 +5,18 @@ get_devices(){ ...@@ -5,17 +5,18 @@ get_devices(){
device=() device=()
while : while :
do do
record=`mktemp -t temp.record.XXXXXX` record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record gpustat > $record
all_devices=$(seq 0 `cat $record | sed '1,2d' | wc -l`); all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0 count=0
for dev in ${all_devices[@]} for dev in ${all_devices[@]}
do do
line=`expr $dev + 2` line=$((dev + 2))
use=`cat $record | head -n $line | tail -1 | cut -d '|' -f3 | cut -d '/' -f1` use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
break break
fi fi
......
...@@ -32,14 +32,14 @@ src_lang=en ...@@ -32,14 +32,14 @@ src_lang=en
tgt_lang=de tgt_lang=de
lang=${src_lang}-${tgt_lang} lang=${src_lang}-${tgt_lang}
dataset=mustc-v2 dataset=st
task=speech_to_text task=speech_to_text
vocab_type=unigram vocab_type=unigram
asr_vocab_size=5000 asr_vocab_size=5000
vocab_size=10000 vocab_size=10000
share_dict=1 share_dict=1
speed_perturb=0 speed_perturb=0
lcrm=1 lcrm=0
tokenizer=0 tokenizer=0
use_specific_dict=0 use_specific_dict=0
...@@ -48,19 +48,19 @@ specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de ...@@ -48,19 +48,19 @@ specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=/media/data/${dataset} org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/st data_dir=~/st/data/${dataset}/st
test_subset=tst-COMMON test_subset=tst-COMMON
# exp # exp
exp_prefix=${time} exp_prefix=$(date "+%m%d")
extra_tag= extra_tag=
extra_parameter= extra_parameter=
exp_tag=baseline exp_tag=baseline
exp_name= exp_name=
# config # config
train_config=train_ctc.yaml train_config=ctc
# training setting # training setting
fp16=1 fp16=1
...@@ -98,10 +98,9 @@ fi ...@@ -98,10 +98,9 @@ fi
. ./local/parse_options.sh || exit 1; . ./local/parse_options.sh || exit 1;
# full path
train_config=$pwd_dir/conf/${train_config}
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag} config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
...@@ -120,7 +119,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -120,7 +119,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -e ${data_dir}/${lang} ]]; then if [[ ! -e ${data_dir}/${lang} ]]; then
mkdir -p ${data_dir}/${lang} mkdir -p ${data_dir}/${lang}
fi fi
source ~/tools/audio/bin/activate
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
...@@ -183,7 +181,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -183,7 +181,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
deactivate
fi fi
data_dir=${data_dir}/${lang} data_dir=${data_dir}/${lang}
...@@ -194,7 +191,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -194,7 +191,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then if [[ ${gpu_num} -eq 0 ]]; then
device=() device=""
else else
source ./local/utils.sh source ./local/utils.sh
device=$(get_devices $gpu_num 0) device=$(get_devices $gpu_num 0)
...@@ -211,12 +208,31 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -211,12 +208,31 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
cp ${train_config} ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit 1
fi
cp ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--train-config ${train_config}
--task ${task} --task ${task}
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test --skip-invalid-size-inputs-valid-test
...@@ -287,8 +303,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -287,8 +303,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# save info # save info
log=./history.log log=./history.log
echo "${time} | ${device} | ${data_dir} | ${model_dir} " >> $log echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
cat $log | tail -n 50 > tmp.log tail -n 50 ${log} > tmp.log
mv tmp.log $log mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device} export CUDA_VISIBLE_DEVICES=${device}
...@@ -296,7 +312,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -296,7 +312,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
eval $cmd eval $cmd
sleep 2s sleep 2s
tail -n `wc -l ${model_dir}/train.log | awk '{print $1+1}'` -f ${model_dir}/train.log tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi fi
fi fi
wait wait
...@@ -319,7 +335,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -319,7 +335,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then if [[ ${gpu_num} -eq 0 ]]; then
device=() device=""
else else
source ./local/utils.sh source ./local/utils.sh
device=$(get_devices $gpu_num 0) device=$(get_devices $gpu_num 0)
...@@ -330,8 +346,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -330,8 +346,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
result_file=${model_dir}/decode_result result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file} [[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ }) test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do for subset in "${test_subset[@]}"; do
subset=${subset}_st subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
......
...@@ -3,30 +3,29 @@ ...@@ -3,30 +3,29 @@
# training the model # training the model
gpu_num=8 gpu_num=8
update_freq=2 update_freq=1
max_tokens=20000 max_tokens=40000
exp_name=
extra_tag= extra_tag=
extra_parameter= extra_parameter=
#extra_tag="${extra_tag}" #extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} " #extra_parameter="${extra_parameter} "
#extra_tag="${extra_tag}_encdlcl" exp_tag=
#extra_parameter="${extra_parameter} --use-enc-dlcl"
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#extra_tag="${extra_tag}_decdlcl" #config_list=(pds_base)
#extra_parameter="${extra_parameter} --use-dec-dlcl" #config_list=(pds_base conformer)
# exp full name
exp_name=
exp_tag=baseline train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
train_config=train_ctc.yaml
#train_config=train_ctc_conformer.yaml
#train_config=train_ctc_conformer_rpr.yaml
#train_config=train_ctc_sate.yaml
#train_config=train_ctc_sate_rpr.yaml
#train_config=train_ctc_sate_conformer.yaml
#train_config=train_ctc_sate_conformer_rpr.yaml
cmd="./run.sh cmd="./run.sh
--stage 1 --stage 1
...@@ -50,5 +49,5 @@ if [[ -n ${extra_parameter} ]]; then ...@@ -50,5 +49,5 @@ if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\"" cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi fi
echo $cmd echo ${cmd}
eval $cmd eval ${cmd}
train-subset: train_st
valid-subset: dev_st
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_conformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
#dropout: 0.1
#activation-fn: relu
#encoder-embed-dim: 256
#encoder-ffn-embed-dim: 2048
#encoder-layers: 12
#decoder-layers: 6
#encoder-attention-heads: 4
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
# conformer
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
# relative position encoding
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
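Most model dimensions in this file are commented out, so the s2t_conformer_m architecture defaults apply. If one of them needs to be overridden without editing the YAML, the extra_parameter hook already present in train.sh can carry the flag; the sketch below assumes that mechanism and uses a made-up override value.

#!/bin/bash
# Illustration only: forward an extra fairseq flag through train.sh's hook.
extra_tag="dp0.15"                 # appended to the experiment name
extra_parameter="--dropout 0.15"   # placeholder override

cmd="./run.sh --stage 1 --extra_tag ${extra_tag} --extra_parameter \"${extra_parameter}\""
echo ${cmd}
# eval ${cmd}   # train.sh evaluates the assembled command the same way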
train-subset: train_st,train_covost
valid-subset: dev_st
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
acoustic-encoder: transformer
adapter: league
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
MAIN_ROOT=$PWD/../../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/src/lib
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
. "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh
export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH
export OMP_NUM_THREADS=1
# check extra module installation
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
return 1
fi
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: dlcl_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
use-enc-dlcl: True
use-dec-dlcl: True
MAIN_ROOT=$PWD/../../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/src/lib
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
. "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh
export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH
export OMP_NUM_THREADS=1
# check extra module installation
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
return 1
fi
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
train-subset: train_st
valid-subset: dev_st
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
acoustic-encoder: transformer
adapter: league
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
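The commented load-pretrained-*-from keys above are the hooks for initializing SATE from separately trained checkpoints. A hedged sketch of one way to supply them at launch time, assuming these YAML keys map one-to-one to command-line flags like the other keys in these configs; the checkpoint paths are placeholders, not paths from this repository.

#!/bin/bash
# Illustration only: initialize the acoustic encoder and the decoder from
# existing ASR / MT checkpoints via the extra_parameter hook.
asr_ckpt=/path/to/asr/checkpoint_best.pt   # placeholder
mt_ckpt=/path/to/mt/checkpoint_best.pt     # placeholder

extra_parameter="--load-pretrained-acoustic-encoder-from ${asr_ckpt}"
extra_parameter="${extra_parameter} --load-pretrained-decoder-from ${mt_ckpt}"

cmd="./run.sh --stage 1 --extra_parameter \"${extra_parameter}\""
echo ${cmd}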
train-subset: train_st
valid-subset: dev_st
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_conformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
#dropout: 0.1
#activation-fn: relu
#encoder-embed-dim: 256
#encoder-ffn-embed-dim: 2048
#encoder-layers: 12
#decoder-layers: 6
#encoder-attention-heads: 4
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
# conformer
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
# relative position encoding
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
train-subset: train_st,train_covost
valid-subset: dev_st
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
acoustic-encoder: transformer
adapter: league
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 100
max-decoder-relative-length: 20
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
MAIN_ROOT=$PWD/../../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/src/lib
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
. "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh
export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH
export OMP_NUM_THREADS=1
# check extra module installation
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
return 1
fi
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from fairseq import checkpoint_utils
from fairseq.models import (
register_model,
register_model_architecture,
)
from fairseq.models.speech_to_text import (
S2TTransformerModel,
s2t_transformer_s,
)
@register_model("s2ttransformer_simul_trans")
class SimulS2TTransformerModel(S2TTransformerModel):
"""
Implementation of the paper:
SimulMT to SimulST: Adapting Simultaneous Text Translation to
End-to-End Simultaneous Speech Translation
https://www.aclweb.org/anthology/2020.aacl-main.58.pdf
"""
@staticmethod
def add_args(parser):
super(SimulS2TTransformerModel, SimulS2TTransformerModel).add_args(parser)
parser.add_argument(
"--train-monotonic-only",
action="store_true",
default=False,
help="Only train monotonic attention",
)
# @classmethod
# def build_decoder(cls, args, task, embed_tokens):
# tgt_dict = task.tgt_dict
#
# from examples.simultaneous_translation.models.transformer_monotonic_attention import (
# TransformerMonotonicDecoder,
# )
#
# decoder = TransformerMonotonicDecoder(args, tgt_dict, embed_tokens)
#
# if getattr(args, "load_pretrained_decoder_from", None):
# decoder = checkpoint_utils.load_pretrained_component_from_model(
# component=decoder, checkpoint=args.load_pretrained_decoder_from
# )
# return decoder
@register_model_architecture(
"s2ttransformer_simul_trans", "s2ttransformer_simul_trans_base"
)
def s2ttransformer_simul_trans_base(args):
s2t_transformer_s(args)
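
For reference, the only option this subclass adds on top of S2TTransformerModel is --train-monotonic-only. A rough sketch of inspecting that flag through a bare argparse parser follows; it assumes fairseq is installed and the class above is importable, and the module name simul_s2t_transformer is a placeholder, not the file's real path.

# Rough sketch (not verified against a specific fairseq version): let the model
# class register its options on an empty parser and read back the new flag.
import argparse

from simul_s2t_transformer import SimulS2TTransformerModel  # placeholder module name

parser = argparse.ArgumentParser()
SimulS2TTransformerModel.add_args(parser)
args, _ = parser.parse_known_args(["--train-monotonic-only"])
print(args.train_monotonic_only)  # expected: True
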
@@ -38,7 +38,7 @@ log = logging.getLogger(__name__)
 MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]
-class ASR_Dataset(Dataset):
+class ASRDataset(Dataset):
     """
     Create a Dataset for MuST-C. Each item is a tuple of the form:
     waveform, sample_rate, source utterance, target utterance, speaker_id,
@@ -70,9 +70,6 @@ class ASR_Dataset(Dataset):
         self.data = []
         for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]):
             wav_path = wav_root / wav_filename
-            try:
-                sample_rate = torchaudio.info(wav_path.as_posix())[0].rate
-            except TypeError:
-                sample_rate = torchaudio.info(wav_path.as_posix()).sample_rate
+            sample_rate = torchaudio.info(wav_path.as_posix()).sample_rate
             seg_group = sorted(_seg_group, key=lambda x: float(x["offset"]))
             for i, segment in enumerate(seg_group):
@@ -185,7 +182,7 @@ def process(args):
     for split in splits:
         print(f"Fetching split {split}...")
-        dataset = ASR_Dataset(root.as_posix(), lang, split, args.speed_perturb, args.tokenizer)
+        dataset = ASRDataset(root.as_posix(), lang, split, args.speed_perturb, args.tokenizer)
         is_train_split = split.startswith("train")
         print("Extracting log mel filter bank features...")
         if is_train_split and args.cmvn_type == "global":
@@ -246,7 +243,7 @@ def process(args):
     if args.task == "st" and args.add_src:
         manifest["src_text"] = []
-    dataset = ASR_Dataset(args.data_root, lang, split, args.speed_perturb, args.tokenizer)
+    dataset = ASRDataset(args.data_root, lang, split, args.speed_perturb, args.tokenizer)
     for idx in range(len(dataset)):
         items = dataset.get_fast(idx)
         for item in items:
......
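
The try/except removed above handled older torchaudio backends, where torchaudio.info() returned a tuple whose first element exposed .rate; the updated scripts assume the newer object-style return value with a .sample_rate attribute. A minimal standalone sketch of that assumption, with a placeholder wav path:

# Minimal sketch of what the updated data-prep scripts now rely on:
# torchaudio.info() returning metadata with .sample_rate (newer torchaudio).
# "example.wav" is a placeholder path.
import torchaudio

info = torchaudio.info("example.wav")
print(info.sample_rate, info.num_frames)
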
@@ -25,7 +25,7 @@ log = logging.getLogger(__name__)
 MANIFEST_COLUMNS = ["src_text", "tgt_text"]
-class MTData(Dataset):
+class MTDataset(Dataset):
     """
     Create a Dataset for MuST-C. Each item is a tuple of the form:
     waveform, sample_rate, source utterance, target utterance, speaker_id,
@@ -72,7 +72,7 @@ def process(args):
         is_train_split = split.startswith("train")
         manifest = {c: [] for c in MANIFEST_COLUMNS}
-        dataset = MTData(args.data_root, src_lang, tgt_lang, split)
+        dataset = MTDataset(args.data_root, src_lang, tgt_lang, split)
         for src_text, tgt_text in tqdm(dataset):
             if args.lowercase_src:
                 src_text = src_text.lower()
......
@@ -75,9 +75,6 @@ class MUSTC(Dataset):
         self.data = []
         for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]):
             wav_path = wav_root / wav_filename
-            try:
-                sample_rate = torchaudio.info(wav_path.as_posix())[0].rate
-            except TypeError:
-                sample_rate = torchaudio.info(wav_path.as_posix()).sample_rate
+            sample_rate = torchaudio.info(wav_path.as_posix()).sample_rate
             seg_group = sorted(_seg_group, key=lambda x: float(x["offset"]))
             for i, segment in enumerate(seg_group):
......
@@ -38,14 +38,15 @@ log = logging.getLogger(__name__)
 MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]
-class ST_Dataset(Dataset):
+class STDataset(Dataset):
     """
     Create a Dataset for MuST-C. Each item is a tuple of the form:
     waveform, sample_rate, source utterance, target utterance, speaker_id,
     utterance_id
     """
-    def __init__(self, root: str, src_lang, tgt_lang: str, split: str, speed_perturb: bool = False, tokenizer: bool = False) -> None:
+    def __init__(self, root: str, src_lang, tgt_lang: str, split: str,
+                 speed_perturb: bool = False, tokenizer: bool = False) -> None:
        _root = Path(root) / f"{src_lang}-{tgt_lang}" / split
        wav_root, txt_root = _root / "wav", _root / "txt"
        if tokenizer:
@@ -71,9 +72,6 @@ class ST_Dataset(Dataset):
         self.data = []
         for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]):
             wav_path = wav_root / wav_filename
-            try:
-                sample_rate = torchaudio.info(wav_path.as_posix())[0].rate
-            except TypeError:
-                sample_rate = torchaudio.info(wav_path.as_posix()).sample_rate
+            sample_rate = torchaudio.info(wav_path.as_posix()).sample_rate
             seg_group = sorted(_seg_group, key=lambda x: float(x["offset"]))
             for i, segment in enumerate(seg_group):
@@ -194,7 +192,7 @@ def process(args):
     for split in splits:
         print(f"Fetching split {split}...")
-        dataset = ST_Dataset(root.as_posix(), src_lang, tgt_lang, split, args.speed_perturb, args.tokenizer)
+        dataset = STDataset(root.as_posix(), src_lang, tgt_lang, split, args.speed_perturb, args.tokenizer)
         is_train_split = split.startswith("train")
         print("Extracting log mel filter bank features...")
         if is_train_split and args.cmvn_type == "global":
@@ -255,7 +253,7 @@ def process(args):
     if args.task == "st" and args.add_src:
         manifest["src_text"] = []
-    dataset = ST_Dataset(args.data_root, src_lang, tgt_lang, split, args.speed_perturb, args.tokenizer)
+    dataset = STDataset(args.data_root, src_lang, tgt_lang, split, args.speed_perturb, args.tokenizer)
     for idx in range(len(dataset)):
         items = dataset.get_fast(idx)
         for item in items:
......
@@ -10,6 +10,7 @@ import torch.nn.functional as F
 from fairseq import metrics, utils
 from fairseq.criterions import register_criterion
 from fairseq.data.data_utils import post_process
+from fairseq.logging.meters import safe_round
 from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion
@@ -23,14 +24,15 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
         self.blank_idx = task.target_dictionary.index(task.blank_symbol) if hasattr(task, 'blank_symbol') else 0
         self.pad_idx = task.target_dictionary.pad()
         self.eos_idx = task.target_dictionary.eos()
         self.report_accuracy = True
-        assert 0 <= ctc_weight <= 1
+        assert 0 <= ctc_weight
         self.ctc_weight = ctc_weight
         if self.ctc_weight > 0:
             assert getattr(task, "src_dict", None) is not None, "CTC need a source dictionary."
-            self.zero_infinity = True
             self.post_process = post_process
+            self.ctc_loss = torch.nn.CTCLoss(blank=self.blank_idx, reduction="sum", zero_infinity=True)
     @staticmethod
     def add_args(parser):
@@ -54,7 +56,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
             default="letter",
             type=str,
             help="how to post process predictions into words. can be letter, "
-            "wordpiece, BPE symbols, etc. "
+            "word-piece, BPE symbols, etc. "
            "See fairseq.data.data_utils.post_process() for full list of options",
        )
@@ -72,7 +74,6 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
             prev_output_tokens=prev_output_tokens, encoder_out=encoder_out
         )
-        # net_output = model(**sample["net_input"])
         loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
         sample_size = (
             sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
@@ -100,10 +101,12 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
     def compute_ctc_loss(self, model, sample, encoder_out):
         transcript = sample["transcript"]
-        ctc_logit = model.encoder.compute_ctc_logit(encoder_out)
+        ctc_logit = encoder_out["ctc_logit"][0]
         lprobs = model.get_normalized_probs(
             [ctc_logit], log_probs=True
         ).contiguous() # (T, B, C) from the encoder
+        lprobs.batch_first = False
         non_padding_mask = ~encoder_out["encoder_padding_mask"][0]
         input_lengths = non_padding_mask.long().sum(-1)
@@ -114,14 +117,11 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
         transcript_lengths = pad_mask.sum(-1)
         with torch.backends.cudnn.flags(enabled=False):
-            loss = F.ctc_loss(
+            loss = self.ctc_loss(
                 lprobs,
                 targets_flat,
                 input_lengths,
                 transcript_lengths,
-                blank=self.blank_idx,
-                reduction="sum",
-                zero_infinity=self.zero_infinity,
             )
         logging_output = {
@@ -141,9 +141,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
                 wv_errs = 0
                 for lp, t, inp_l in zip(
                     lprobs_t,
-                    sample["target_label"]
-                    if "target_label" in sample
-                    else sample["target"],
+                    sample["target_label"] if "target_label" in sample else sample["target"],
                     input_lengths,
                 ):
                     lp = lp[:inp_l].unsqueeze(0)
@@ -239,6 +237,44 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
                 else float("nan"),
             )
+        c_errors = sum(log.get("c_errors", 0) for log in logging_outputs)
+        metrics.log_scalar("_c_errors", c_errors)
+        c_total = sum(log.get("c_total", 0) for log in logging_outputs)
+        metrics.log_scalar("_c_total", c_total)
+        w_errors = sum(log.get("w_errors", 0) for log in logging_outputs)
+        metrics.log_scalar("_w_errors", w_errors)
+        wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs)
+        metrics.log_scalar("_wv_errors", wv_errors)
+        w_total = sum(log.get("w_total", 0) for log in logging_outputs)
+        metrics.log_scalar("_w_total", w_total)
+        if c_total > 0:
+            metrics.log_derived(
+                "uer",
+                lambda meters: safe_round(
+                    meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
+                )
+                if meters["_c_total"].sum > 0
+                else float("nan"),
+            )
+        if w_total > 0:
+            metrics.log_derived(
+                "wer",
+                lambda meters: safe_round(
+                    meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3
+                )
+                if meters["_w_total"].sum > 0
+                else float("nan"),
+            )
+            metrics.log_derived(
+                "raw_wer",
+                lambda meters: safe_round(
+                    meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3
+                )
+                if meters["_w_total"].sum > 0
+                else float("nan"),
+            )
     @staticmethod
     def logging_outputs_can_be_summed() -> bool:
         """
......
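
The criterion now instantiates a single torch.nn.CTCLoss in __init__ instead of passing blank, reduction, and zero_infinity to F.ctc_loss on every forward call; both forms compute the same value. A standalone sketch of that equivalence with random tensors (the shapes and blank index are illustrative, not taken from the criterion above):

# Standalone sketch: the module-style CTC loss configured once matches the
# functional call with the same keyword arguments.
import torch
import torch.nn.functional as F

T, B, C = 50, 4, 20  # input length, batch size, vocabulary size (incl. blank)
blank_idx = 0
log_probs = torch.randn(T, B, C).log_softmax(-1)  # (T, B, C) log-probabilities
targets = torch.randint(1, C, (B, 10))            # avoid the blank index
input_lengths = torch.full((B,), T, dtype=torch.long)
target_lengths = torch.full((B,), 10, dtype=torch.long)

ctc = torch.nn.CTCLoss(blank=blank_idx, reduction="sum", zero_infinity=True)
loss_module = ctc(log_probs, targets, input_lengths, target_lengths)
loss_functional = F.ctc_loss(
    log_probs, targets, input_lengths, target_lengths,
    blank=blank_idx, reduction="sum", zero_infinity=True,
)
assert torch.allclose(loss_module, loss_functional)
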
@@ -19,7 +19,6 @@ class FairseqDecoder(nn.Module):
         self.onnx_trace = False
         self.adaptive_softmax = None
-
     def forward(self, prev_output_tokens, encoder_out=None, **kwargs):
         """
         Args:
......
@@ -4,8 +4,9 @@
 # LICENSE file in the root directory of this source tree.
 from .berard import * # noqa
+from .ctc import * # noqa
 from .convtransformer import * # noqa
 from .s2t_transformer import * # noqa
 from .s2t_conformer import * # noqa
-from .pys2t_transformer import * # noqa
+from .pdss2t_transformer import * # noqa
 from .s2t_sate import * # noqa
@@ -7,8 +7,8 @@ from typing import Dict, List, Optional, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from fairseq.data.data_utils import lengths_to_padding_mask
 from fairseq import checkpoint_utils, utils
+from fairseq.data.data_utils import lengths_to_padding_mask
 from fairseq.models import (
     FairseqEncoder,
     FairseqEncoderDecoderModel,
@@ -28,6 +28,7 @@ class ConvTransformerModel(FairseqEncoderDecoderModel):
     Transformer-based Speech translation model from ESPNet-ST
     https://arxiv.org/abs/2004.10234
     """
+
     def __init__(self, encoder, decoder):
         super().__init__(encoder, decoder)
@@ -303,11 +304,11 @@ class ConvTransformerEncoder(FairseqEncoder):
        x = self.embed_scale * x
        subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
-        input_lengths = torch.min(
-            (src_lengths.float() / subsampling_factor).ceil().long(),
-            x.size(0) * src_lengths.new_ones([src_lengths.size(0)]).long()
-        )
+        input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long()
+        input_len_1 = x.size(0) * torch.ones([src_lengths.size(0)]).long().to(
+            input_len_0.device
+        )
+        input_lengths = torch.min(input_len_0, input_len_1)
        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
......
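
The rewritten length computation in ConvTransformerEncoder takes the ceil-divided source lengths and clips them to the number of frames the convolutional front end actually produced. A small sketch of that arithmetic in isolation (the lengths and sequence sizes are made-up toy values):

# Toy sketch of the new input-length computation outside the encoder.
import torch

src_lengths = torch.tensor([480, 397, 512])  # toy source frame counts
max_seq_len, output_seq_len = 512, 128       # before / after conv subsampling
x_t = output_seq_len                         # stands in for x.size(0)

subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long()
input_len_1 = x_t * torch.ones([src_lengths.size(0)]).long()
input_lengths = torch.min(input_len_0, input_len_1)
print(subsampling_factor, input_lengths)     # 4 tensor([120, 100, 128])
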