Commit bab6c520 by xuchen

fix the bugs and use float32 for softmax

parent b0a45459
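The recipe scripts and configs below are the part of this commit shown here; the float32-softmax change itself lands in the model code, which this listing does not include. As a rough, generic sketch of the technique named in the commit title (not the actual patch), computing an attention softmax in float32 under fp16 training looks like:

import torch
import torch.nn.functional as F

def softmax_fp32(scores: torch.Tensor) -> torch.Tensor:
    # Compute the softmax in float32 even when the model runs in fp16
    # (these recipes train with --fp16), then cast back to the input
    # dtype. Half precision can overflow in the exponentials; float32
    # keeps the normalization numerically stable.
    return F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)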
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=iwslt2022
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(train_covost)
mkdir -p $data_dir
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
encoder-attention-type: rel_pos
encoder-activation-fn: swish
ctc-weight: 0.3
post-process: sentencepiece
use-enc-dlcl: True
use-dec-dlcl: True
ctc-weight: 0.2
intermedia-ctc-layers: 6,9
intermedia-adapter: league
intermedia-ctc-weight: 0.1
ctc-self-distill-weight: 0
post-process: sentencepiece
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
arch: pdss2t_transformer_s_8
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_ctc
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
post-process: sentencepiece
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
encoder-attention-heads: 4
#load-pretrained-encoder-from:
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#max-encoder-relative-length: 100
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(dev tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
cer=0
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--cer ${cer}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
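# Wait until ${gpu_num} idle GPUs are available, then launch train.sh.
# A GPU counts as idle when the memory usage reported by gpustat is below 100 MB.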
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
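# For example (a sketch; "foo.sh" and its option variable are hypothetical):
#   # inside foo.sh:
#   gpu_num=1                    # default value, defined before parsing
#   . ./local/parse_options.sh   # consumes foo.sh's command-line arguments
#   # calling ./foo.sh --gpu_num 4 then sets gpu_num=4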
###
### The --config file options have lower priority than command-line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
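# Example usage (a sketch, matching how the run scripts below call it):
#   source ./local/utils.sh
#   device=$(get_devices 2 0)    # blocks until 2 idle GPUs are found
#   export CUDA_VISIBLE_DEVICES=${device}
# With use_cpu=1 it yields -1 (CPU) instead of waiting for a free GPU.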
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mustc
task=speech_to_text
vocab_type=unigram
vocab_size=5000
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
use_specific_dict=1
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base
data_config=config.yaml
# training setting
fp16=1
max_tokens=40000
step_valid=0
# decoding setting
cer=0
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
feature_zip=fbank80.zip
if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi
if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ASR Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ASR Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
"
if [[ ${cer} -eq 1 ]]; then
cmd="${cmd}
--wer-char-level"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base ctc)
config_list=(purectc)
#config_list=(base conformer)
#config_list=(pds_base_16)
#config_list=(pds_base_16 conformer rpr)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
src_vocab_prefix=spm_unigram32000_share
tgt_vocab_prefix=spm_unigram32000_share
src_lang=en
tgt_lang=de
tokenize=1
splits=(newstest2014 newstest2016)
for split in ${splits[@]}; do
src_file=${data_dir}/${split}.${src_lang}
tgt_file=${data_dir}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.tok
tgt_file=${tgt_file}.tok
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${data_dir}/final
cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${data_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${data_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
use-enc-dlcl: True
use-dec-dlcl: True
#encoder-attention-type: rel_selfattn
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(test)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ -n ${test_subset} ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
cmd="$cmd --test_subset ${test_subset}"
fi
echo $cmd
eval $cmd
import sys
import string
in_file = sys.argv[1]
with open(in_file, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip().lower()
for w in string.punctuation:
line = line.replace(w, "")
line = line.replace(" ", "")
print(line)
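# Usage sketch (as invoked by the lcrm branches of these recipes):
#   python local/lower_rm.py input.en > input.lcrm.en
# Each line is lowercased, punctuation is stripped, and all spaces are
# removed before the line is written to stdout.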
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority than command-line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d")
stage=0
stop_stage=0
######## hardware ########
# devices
device=()
gpu_num=8
update_freq=1
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mustc
task=translation
vocab_type=unigram
vocab_size=10000
share_dict=1
lcrm=0
tokenizer=0
use_specific_dict=1
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/mt
train_subset=train
valid_subset=dev
trans_subset=tst-COMMON
test_subset=test
# exp
exp_prefix=${time}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base_s
# training setting
fp16=1
max_tokens=4096
step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir}
else
if [[ "${vocab_type}" == "char" ]]; then
vocab_name=${vocab_type}
exp_prefix=${exp_prefix}_${vocab_type}
else
vocab_name=${vocab_type}${vocab_size}
fi
data_dir=${data_dir}/${vocab_name}
src_vocab_prefix=spm_${vocab_name}_${src_lang}
tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_name}_share
tgt_vocab_prefix=spm_${vocab_name}_share
fi
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${tokenizer} -eq 1 ]]; then
train_subset=${train_subset}.tok
valid_subset=${valid_subset}.tok
trans_subset=${trans_subset}.tok
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
# full path
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part yourself.
echo "stage 0: MT Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset}
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
else
cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
fi
fi
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
txt_dir=${org_data_dir}/data/${split}/txt
else
txt_dir=${org_data_dir}/data/${split}
fi
cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${data_dir}/data/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${txt_dir}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
}&
done
wait
cmd="python ${code_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset}
--testpref ${data_dir}/data/${trans_subset}
--destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/data-bin
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: MT Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: MT Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--gen-subset ${subset}
--task ${task}
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--post-process sentencepiece"
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=1
update_freq=1
max_tokens=8192
exp_tag=baseline
config_list=(base)
# exp full name
exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=mustc
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(2019)
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
encoder-attention-type: rel_pos
encoder-activation-fn: swish
ctc-weight: 0.3
post-process: sentencepiece
use-enc-dlcl: True
use-dec-dlcl: True
ctc-weight: 0.2
intermedia-ctc-layers: 6,9
intermedia-adapter: league
intermedia-ctc-weight: 0.1
ctc-self-distill-weight: 0
post-process: sentencepiece
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
arch: pdss2t_transformer_s_8
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
pds-ctc: 1_1_1_1
intermedia-adapter: league
intermedia-ctc-weight: 0.15
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: league
encoder-embed-dim: 512
ctc-layer: 12
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: league
encoder-embed-dim: 256
ctc-layer: 12
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(dev tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
tail -n 1 ${res_dir}/generate-${subset}.txt
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority than command-line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined -- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
break
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
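A rough Python analogue of get_devices above, for reuse outside bash. It shells out to nvidia-smi (assumed to be on PATH) instead of gpustat, and mirrors the memory cut above: a card counts as free when it uses under 100 MB. The function name and parameters are illustrative, not part of this repo.

import subprocess
import time

def get_free_gpus(gpu_num, use_cpu=False, poll_seconds=60):
    while True:
        # One integer per line: memory used on each GPU, in MB.
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            text=True,
        )
        free = [i for i, mb in enumerate(out.split()) if int(mb) < 100]
        if len(free) >= gpu_num:
            return free[:gpu_num]
        if use_cpu:
            return [-1]           # fall back to CPU, like device=(-1)
        time.sleep(poll_seconds)  # wait and poll again, like sleep 60s

# Example: print(",".join(map(str, get_free_gpus(2))))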
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'strict' mode, it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'; -x prints commands
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mustc
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
use_specific_dict=0
specific_prefix=valid
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base,ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=${root_dir}/checkpoints/${dataset}/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to prepare the data by yourself in the following stage.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
feature_zip=fbank80.zip
if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi
if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
# create ASR vocabulary if necessary
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && (echo -e "\033[34mRun command: \n${cmd} \033[0m" && eval $cmd)
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ST Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit 1
fi
cp ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ST Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}"
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#config_list=(pds_base)
#config_list=(pds_base conformer)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
@@ -5,6 +5,7 @@ clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 10000
+weight-decay: 1e-6
 lr: 2e-3
 adam_betas: (0.9,0.98)
@@ -13,12 +14,21 @@ label_smoothing: 0.1
 subsampling-type: conv1d
 subsmapling-layers: 2
-subsampling-filter: 512
+subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
 subsampling-norm: none
 subsampling-activation: glu
+
+ctc-weight: 0.2
+intermedia-ctc-layers: 6,9
+intermedia-adapter: league
+intermedia-ctc-weight: 0.1
+intermedia-drop-prob: 0.5
+ctc-self-distill-weight: 0
+post-process: sentencepiece
+
 dropout: 0.1
 activation-fn: relu
 encoder-embed-dim: 256
@@ -33,8 +43,5 @@ decoder-attention-heads: 4
 attention-dropout: 0.1
 activation-dropout: 0.1

-macaron-style: True
-use-cnn-module: True
-cnn-module-kernel: 31
-encoder-activation-fn: swish
-encoder-attention-type: rel_pos_legacy
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
@@ -2,43 +2,61 @@ set -e
 eval=1
-lcrm=0
+lcrm=1
 tokenizer=0

-root_dir=~/st/Fairseq-S2T
-data_dir=~/st/data/test
-vocab_dir=~/st/data/mustc/st/en-de
+vocab_type=unigram
+vocab_size=5000
+use_raw_audio=0
+speed_perturb=0
+dataset=mustc
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/asr
+use_specific_dict=0
+specific_prefix=st
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share

 src_lang=en
-tgt_lang=de
+tgt_lang=zh
 subsets=(2019)

-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-splits=$(echo ${subsets[*]} | sed 's/ /,/g')
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
+splits=$(echo ${subsets[*]} | sed 's/ /_/g')
+cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
+    --data-root ${org_data_dir}
     --output-root ${data_dir}
-    --splits ${splits}
     --task asr
     --src-lang ${src_lang}
-    --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --cmvn-type utterance"
+    --splits ${splits}
+    --vocab-type ${vocab_type}
+    --vocab-size ${vocab_size}"
+
+if [[ ${use_raw_audio} -eq 1 ]]; then
+    cmd="$cmd
+    --raw"
+fi
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+    cmd="$cmd
+    --asr-prefix ${asr_vocab_prefix}"
+fi
+if [[ ${speed_perturb} -eq 1 ]]; then
+    cmd="$cmd
+    --speed-perturb"
+fi
 if [[ ${lcrm} -eq 1 ]]; then
     cmd="$cmd
     --lowercase-src
     --rm-punc-src"
 fi
 if [[ ${tokenizer} -eq 1 ]]; then
     cmd="$cmd
     --tokenizer"
 fi

 echo -e "\033[34mRun command: \n${cmd} \033[0m"
 [[ $eval -eq 1 ]] && eval ${cmd}
@@ -5,44 +5,59 @@ eval=1
 lcrm=1
 tokenizer=0

-root_dir=~/st/Fairseq-S2T
-data_dir=/home/xuchen/st/data/test
-vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
+vocab_type=unigram
+vocab_size=5000
+use_raw_audio=0
+speed_perturb=0
+dataset=mustc
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/st
+use_specific_dict=0
+specific_prefix=st
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share

 src_lang=en
-tgt_lang=de
-splits=(2019)
-splits=$(echo ${splits[*]} | sed 's/ /_/g')
+tgt_lang=zh
+subsets=(2019)
+splits=$(echo ${subsets[*]} | sed 's/ /_/g')

-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
+cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
+    --data-root ${org_data_dir}
     --output-root ${data_dir}
-    --splits ${splits}
-    --task st
+    --task asr
     --src-lang ${src_lang}
     --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --st-spm-prefix ${st_vocab_prefix}
-    --cmvn-type utterance"
+    --splits ${splits}
+    --vocab-type ${vocab_type}
+    --vocab-size ${vocab_size}"
+
+if [[ ${use_raw_audio} -eq 1 ]]; then
+    cmd="$cmd
+    --raw"
+fi
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+    cmd="$cmd
+    --asr-prefix ${asr_vocab_prefix}"
+fi
+if [[ ${speed_perturb} -eq 1 ]]; then
+    cmd="$cmd
+    --speed-perturb"
+fi
 if [[ ${lcrm} -eq 1 ]]; then
     cmd="$cmd
     --lowercase-src
     --rm-punc-src"
 fi
 if [[ ${tokenizer} -eq 1 ]]; then
     cmd="$cmd
     --tokenizer"
 fi

 echo -e "\033[34mRun command: \n${cmd} \033[0m"
 [[ $eval -eq 1 ]] && eval ${cmd}
@@ -129,11 +129,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     if [[ ! -e ${data_dir} ]]; then
         mkdir -p ${data_dir}
     fi
-    if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
-        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    feature_zip=fbank80.zip
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        feature_zip=fbank80_sp.zip
     fi
-    if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
-        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
     fi

     # create ASR vocabulary if necessary
@@ -204,13 +205,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     echo -e "\033[34mRun command: \n${cmd} \033[0m"
     [[ $eval -eq 1 ]] && eval ${cmd}

-    if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
-        mv ${data_dir}/fbank80.zip ${data_dir}/..
-        ln -s ${data_dir}/../fbank80.zip ${data_dir}
-    fi
-    if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
-        mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
-        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
+        mv ${data_dir}/${feature_zip} ${data_dir}/..
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
     fi
 fi
@@ -26,7 +26,8 @@ PAD_TOKEN, PAD_TOKEN_ID = "<pad>", 1
 def gen_vocab(
     input_path: Path, output_path_prefix: Path, model_type="bpe",
-    vocab_size=1000, special_symbols: Optional[List[str]] = None
+    vocab_size=1000, special_symbols: Optional[List[str]] = None,
+    normalization_rule_name=None
 ):
     # Train SentencePiece Model
     arguments = [
@@ -43,6 +44,8 @@ def gen_vocab(
         f"--eos_id={EOS_TOKEN_ID}",
         f"--pad_id={PAD_TOKEN_ID}",
     ]
+    if normalization_rule_name is not None:
+        arguments.append(f"--normalization_rule_name={normalization_rule_name}")
     if special_symbols is not None:
         _special_symbols = ",".join(special_symbols)
         arguments.append(f"--user_defined_symbols={_special_symbols}")
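The new normalization_rule_name argument is forwarded to the SentencePiece trainer; passing "identity" (as done for Chinese targets below) disables the default NFKC-style text normalization. A minimal standalone sketch of the effect, with corpus.txt as a placeholder input file:

import sentencepiece as spm

# "identity" leaves the training text untouched; the default rule (nmt_nfkc)
# would normalize full-width punctuation and similar characters.
spm.SentencePieceTrainer.Train(
    "--input=corpus.txt --model_prefix=spm_unigram_zh "
    "--vocab_size=5000 --model_type=unigram "
    "--normalization_rule_name=identity"
)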
@@ -68,10 +68,6 @@ class AudioDataset(Dataset):
             if 0 < self.size < total_length:
                 segments = segments[:self.size]
-            # for idx, seg in enumerate(content):
-            #     segments[idx] = seg
-            #     if 0 < self.size < idx:
-            #         break
         else:
             self.mode = "easy"
@@ -141,7 +137,7 @@ class AudioDataset(Dataset):
             for i, segment in enumerate(seg_group):
                 offset = int(float(segment["offset"]) * sample_rate)
                 n_frames = int(float(segment["duration"]) * sample_rate)
-                _id = f"{wav_path.stem}_{i}"
+                _id = f"{split}_{wav_path.stem}_{i}"

                 item = dict()
                 item["audio"] = wav_path.as_posix()
@@ -240,7 +236,7 @@ def process(args):
         if not Path.exists(zip_path) or args.overwrite:
             gen_feature_flag = True

-    if gen_feature_flag:
+    if True and gen_feature_flag:
         if args.speed_perturb:
             feature_root = output_root / "fbank80_sp"
         else:
@@ -265,7 +261,7 @@ def process(args):
         for idx in tqdm(range(len(dataset))):
             item = dataset[idx]
-            utt_id = item["id"]
+            utt_id = item['id']

             features_path = (feature_root / f"{utt_id}.npy").as_posix()
             if os.path.exists(features_path):
@@ -291,7 +287,7 @@ def process(args):
         create_zip(feature_root, zip_path)

         # Clean up
-        shutil.rmtree(feature_root)
+        # shutil.rmtree(feature_root)

         gen_manifest_flag = False
         for split in splits:
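The id change above matters because features are written to one directory keyed by utterance id: without the split prefix, two splits containing a wav file with the same stem would silently overwrite each other's .npy features. A toy illustration (names are made up):

for split in ("train", "dev"):
    stem, i = "talk_0001", 0
    print(f"{stem}_{i}")          # old: "talk_0001_0" for both splits (collision)
    print(f"{split}_{stem}_{i}")  # new: "train_talk_0001_0" / "dev_talk_0001_0"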
@@ -115,6 +115,7 @@ def process(args):
             output_root / tgt_spm_filename_prefix,
             args.vocab_type,
             args.vocab_size,
+            normalization_rule_name="identity" if tgt_lang == "zh" else None
         )

     if not args.share:
@@ -126,6 +127,7 @@ def process(args):
                 output_root / src_spm_filename_prefix,
                 args.vocab_type,
                 args.vocab_size,
+                normalization_rule_name="identity" if tgt_lang == "zh" else None
             )

     # Generate config YAML
@@ -235,8 +235,8 @@ class CtcCriterion(FairseqCriterion):
                 ctc_self_distill_num += 1

                 loss = F.kl_div(
-                    F.log_softmax(inter_ctc_logit, dim=-1),
-                    F.softmax(ctc_logit, dim=-1),
+                    F.log_softmax(inter_ctc_logit, dim=-1, dtype=torch.float32),
+                    F.softmax(ctc_logit, dim=-1, dtype=torch.float32),
                     reduction="none",
                 )
                 loss = loss.sum(-1).transpose(0, 1).masked_fill_(~non_padding_mask, 0.0)
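This hunk is the heart of the commit message: under --fp16 training, a softmax computed entirely in half precision can round small probabilities to exactly zero, after which log/KL terms become -inf or NaN. Computing just the (log-)softmax in float32 avoids that. A self-contained illustration, with made-up logit values:

import torch
import torch.nn.functional as F

logits = torch.tensor([12.0, -4.0, -9.5], dtype=torch.float16)

p_fp16 = F.softmax(logits, dim=-1)                       # smallest prob underflows to 0 in fp16
p_fp32 = F.softmax(logits, dim=-1, dtype=torch.float32)  # stays strictly positive
print(p_fp16[-1].item(), p_fp32[-1].item())

# The float32 pattern keeps the KL divergence finite:
kl = F.kl_div(F.log_softmax(logits, dim=-1, dtype=torch.float32),
              F.softmax(logits, dim=-1, dtype=torch.float32),
              reduction="sum")
print(kl.item())  # ~0.0, since both distributions are identical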
@@ -360,6 +360,7 @@ class SpeechToTextDataset(FairseqDataset):
             target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0)

         transcript = None
+
         if self.src_dict is not None and self.src_texts is not None and self.src_bpe_tokenizer is not None:
             tokenized = self.tokenize_text(self.src_texts[index], True)
             transcript = self.src_dict.encode_line(
@@ -443,6 +444,7 @@ class SpeechToTextDataset(FairseqDataset):
             "ntokens": ntokens,
             "nsentences": len(samples),
         }
+
         return out

     def num_tokens(self, index):
@@ -49,14 +49,16 @@ class CTCCompressStrategy:
             for t_idx, same in enumerate(pred):
                 new_processed_inputs_cnt = processed_inputs_cnt + same[1]
                 # Get the probabilities of the prediction for the different time steps as weight
-                weights = F.softmax(prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]])
+                weights = F.softmax(
+                    prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]], dtype=torch.float32
+                )
                 weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = \
                     weights / weights.sum()
                 processed_inputs_cnt = new_processed_inputs_cnt
         return weights_matrix


-class InterAdapter(nn.Module):
+class Adapter(nn.Module):

     def __init__(self, dim, adapter_type, dictionary, embed_tokens=None, strategy=None):
         super().__init__()
@@ -84,6 +86,7 @@ class InterAdapter(nn.Module):
             self.gate_linear2 = nn.Linear(dim, dim)

         if self.adapter_type == "shrink":
+            assert strategy is not None
             self.ctc_compress = getattr(CTCCompressStrategy, strategy)
             logger.info("CTC Compress Strategy: %s" % strategy)
         elif self.adapter_type == "league":
@@ -94,7 +97,7 @@ class InterAdapter(nn.Module):
     def forward(self, x, padding):

         representation, distribution = x
-        dim1, dim2, dim = representation.size()
+        seq_len, bsz, dim = representation.size()
         org_distribution = distribution
         lengths = (~padding).long().sum(-1)
@@ -103,7 +106,9 @@ class InterAdapter(nn.Module):
         elif self.adapter_type == "context":
             distribution = distribution.view(-1, distribution.size(-1))
-            out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)

         elif self.adapter_type == "league":
             linear_out = self.linear_adapter(representation)
@@ -112,19 +117,25 @@ class InterAdapter(nn.Module):
                 threshold = distribution.sort(dim=-1, descending=True)[0][:, :, cutoff:cutoff+1]
                 distribution = torch.where(distribution > threshold, distribution, torch.zeros_like(distribution))
             distribution = distribution.view(-1, distribution.size(-1))
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            soft_out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)
             out = linear_out + soft_out

         elif self.adapter_type == "gated_league":
             linear_out = self.linear_adapter(representation)
             distribution = distribution.view(-1, distribution.size(-1))
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            soft_out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)
             coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
             out = coef * linear_out + (1 - coef) * soft_out

         elif self.adapter_type == "inter_league":
             distribution = distribution.view(-1, distribution.size(-1))
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            soft_out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)
             out = representation + soft_out

         elif self.adapter_type == "none":
@@ -142,10 +153,11 @@ class InterAdapter(nn.Module):
                 new_lengths = [len(p) for p in batch_predicted]
                 weights_matrix = self.ctc_compress(prob_ctc, batch_predicted, new_lengths,
-                                                   representation.dtype, representation.device)
+                                                   prob_ctc.dtype, prob_ctc.device)

             # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
-            compressed_output = representation.permute(1, 2, 0).bmm(weights_matrix)  # B x C x T'
+            representation = representation.permute(1, 2, 0)
+            compressed_output = representation.float().bmm(weights_matrix).type_as(representation)  # B x C x T'
             out = compressed_output.permute(2, 0, 1)

             out_lengths = lengths.new(new_lengths)
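For readers new to the "shrink" adapter above: it collapses runs of consecutive frames that share the same CTC argmax into a single position. A minimal sketch of that idea, using a uniform average per run instead of the probability-weighted matrix so the shapes are easy to follow; nothing here is the repo's API:

from itertools import groupby
import torch

def ctc_shrink(states, probs):
    """states: T x C encoder output; probs: T x V CTC distribution."""
    preds = probs.argmax(-1).tolist()
    runs = [(tok, len(list(grp))) for tok, grp in groupby(preds)]  # merged segments
    out, start = [], 0
    for tok, length in runs:
        out.append(states[start:start + length].mean(0))  # average each run
        start += length
    return torch.stack(out)  # T' x C with T' <= T

states = torch.randn(10, 4)
probs = torch.softmax(torch.randn(10, 6), dim=-1)
print(ctc_shrink(states, probs).shape)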
@@ -38,10 +38,10 @@ class CTC(nn.Module):
         return x

     def softmax(self, x, temperature=1.0):
-        return F.softmax(self.ctc_projection(x) / temperature, dim=-1)
+        return F.softmax(self.ctc_projection(x) / temperature, dim=-1, dtype=torch.float32)

     def log_softmax(self, x, temperature=1.0):
-        return F.log_softmax(self.ctc_projection(x) / temperature, dim=-1)
+        return F.log_softmax(self.ctc_projection(x) / temperature, dim=-1, dtype=torch.float32)

     def argmax(self, x):
         return torch.argmax(self.ctc_projection(x), dim=-1)
@@ -13,7 +13,7 @@ from fairseq.models import (
     register_model_architecture,
 )
 from fairseq.models.speech_to_text import S2TTransformerModel
-from fairseq.models.speech_to_text.modules import CTC, InterAdapter
+from fairseq.models.speech_to_text.modules import CTC, Adapter

 from fairseq.modules import (
     FairseqDropout,
@@ -673,14 +673,14 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                     strategy = None
                     if args.intermedia_adapter == "shrink":
                         strategy = getattr(args, "ctc_compress_strategy", "avg")
-                    adapter = InterAdapter(embed_dim, args.intermedia_adapter,
-                                           task.source_dictionary, strategy=strategy)
+                    adapter = Adapter(embed_dim, args.intermedia_adapter,
+                                      task.source_dictionary, strategy=strategy)
                     inter_adapter = adapter
                 else:
                     adapter = inter_adapter
             else:
-                adapter = InterAdapter(embed_dim, "none",
-                                       task.source_dictionary)
+                adapter = Adapter(embed_dim, "none",
+                                  task.source_dictionary)
         else:
             ctc = None
             adapter = None
@@ -830,7 +830,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                 logit = ctc(x.clone())
                 intermedia_ctc_logits.append([logit, encoder_padding_mask])
-                prob = F.softmax(logit, dim=-1)
+                prob = utils.softmax(logit, dim=-1)
                 x, encoder_padding_mask = adapter([x, prob], encoder_padding_mask)

         if self.fusion_stages_num != 0:
@@ -14,7 +14,7 @@ from fairseq.models import (
     register_model,
     register_model_architecture,
 )
-from fairseq.models.speech_to_text.modules import InterAdapter, CTC
+from fairseq.models.speech_to_text.modules import Adapter, CTC
 from fairseq.modules import (
     FairseqDropout,
     LayerNorm,
@@ -382,6 +382,12 @@ class S2TCTCModel(FairseqEncoderModel):
             type=int,
             help="cutoff of the distribution",
         )
+        parser.add_argument(
+            "--intermedia-drop-prob",
+            default=0,
+            type=float,
+            help="probability of dropping the followed layers",
+        )
         pass

     @classmethod
@@ -424,9 +430,9 @@ class S2TCTCModel(FairseqEncoderModel):
         else:
             logits = net_output["ctc_logit"][0]
         if log_probs:
-            return utils.log_softmax(logits.float(), dim=-1)
+            return utils.log_softmax(logits, dim=-1)
         else:
-            return utils.softmax(logits.float(), dim=-1)
+            return utils.softmax(logits, dim=-1)

     def forward(self, src_tokens, src_lengths, prev_output_tokens=None):
         """
@@ -513,11 +519,12 @@ class S2TCTCEncoder(FairseqEncoder):
             strategy = None
             if args.intermedia_adapter == "shrink":
-                strategy = getattr(args, "ctc_compress_strategy", None)
+                strategy = getattr(args, "ctc_compress_strategy", "avg")
             elif args.intermedia_adapter == "league":
                 strategy = getattr(args, "intermedia_distribution_cutoff", -1)
-            self.adapter = InterAdapter(dim, args.intermedia_adapter,
-                                        task.source_dictionary, strategy=strategy)
+            self.adapter = Adapter(dim, args.intermedia_adapter,
+                                   task.source_dictionary, strategy=strategy)
+            self.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

     def add_to_dict(self, x, dis, idx):
         sim = 0
@@ -595,11 +602,16 @@ class S2TCTCEncoder(FairseqEncoder):
             # interleave CTC
             if layer_idx in self.intermedia_ctc_layers:
+                if self.intermedia_drop_prob > 0:
+                    p = torch.rand(1).uniform_()
+                    if p < self.intermedia_drop_prob:
+                        break
+
                 norm_x = self.layer_norm(x)
                 logit = self.ctc(norm_x)
                 intermedia_ctc_logits.append(logit)

-                prob = F.softmax(logit, dim=-1)
+                prob = F.softmax(logit, dim=-1, dtype=torch.float32)
                 x, encoder_padding_mask = self.adapter([x, prob], encoder_padding_mask)

         # gather cosine similarity
@@ -702,7 +714,7 @@ class CTCDecoder(object):
             model_path=self.lm_model,
             alpha=self.lm_weight,
             beta=0,
-            cutoff_top_n=self.vocab_size,
+            cutoff_top_n=40,
             cutoff_prob=1.0,
             beam_width=self.beam_size,
             num_processes=20,
@@ -725,7 +737,9 @@ class CTCDecoder(object):
                                                src_lengths=src_lengths)

         ctc_logit = encoder_outs["ctc_logit"][0].transpose(0, 1)
-        beam_results, beam_scores, timesteps, out_lens = self.ctc_decoder.decode(F.softmax(ctc_logit, -1), src_lengths)
+        beam_results, beam_scores, time_steps, out_lens = self.ctc_decoder.decode(
+            utils.softmax(ctc_logit, -1), src_lengths
+        )

         finalized = []
         for idx in range(bsz):
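The new --intermedia-drop-prob option gives the interleaved CTC a LayerDrop-like twist: with the given probability the encoder stops at an intermediate CTC layer during a training forward pass, so the upper layers are occasionally skipped. A toy version of that control flow, with stand-in modules rather than the repo's classes:

import torch
import torch.nn as nn

class ToyEncoder(nn.Module):
    def __init__(self, dim=8, num_layers=4, ctc_layers=(2,), drop_prob=0.5):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))
        self.ctc_layers = set(ctc_layers)
        self.drop_prob = drop_prob
        self.ctc_proj = nn.Linear(dim, 11)  # toy vocabulary size

    def forward(self, x):
        inter_logits = []
        for idx, layer in enumerate(self.layers, start=1):
            x = layer(x).relu()
            if idx in self.ctc_layers:
                if self.training and torch.rand(1).item() < self.drop_prob:
                    break  # randomly truncate the stack, as in the encoder above
                inter_logits.append(self.ctc_proj(x))
        return x, inter_logits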
@@ -18,7 +18,7 @@ from fairseq.models.speech_to_text import (
     PDSS2TTransformerModel,
     PDSS2TTransformerEncoder,
 )
-from fairseq.models.speech_to_text.modules import CTCCompressStrategy
+from fairseq.models.speech_to_text.modules import CTCCompressStrategy, Adapter
 from fairseq.modules import (
     FairseqDropout,
     LayerNorm,
@@ -140,95 +140,103 @@ class S2TSATEModel(S2TTransformerModel):
         return encoder


-class Adapter(nn.Module):
-    def __init__(self, args, dictionary, embed_tokens):
-        super().__init__()
-
-        embed_dim = args.encoder_embed_dim
-
-        self.adapter_type = args.adapter
-        if self.adapter_type in ["linear", "league", "gated_league", "gated_league2"]:
-            self.linear_adapter = nn.Sequential(
-                nn.Linear(embed_dim, embed_dim),
-                LayerNorm(args.encoder_embed_dim),
-                nn.ReLU(),
-            )
-        elif self.adapter_type == "linear2":
-            self.linear_adapter = nn.Sequential(
-                nn.Linear(embed_dim, embed_dim),
-            )
-
-        if self.adapter_type in ["embed", "context", "league", "gated_league", "gated_league2"]:
-            if embed_tokens is None:
-                num_embeddings = len(dictionary)
-                self.embed_adapter = Embedding(num_embeddings, embed_dim, dictionary.pad())
-            else:
-                self.embed_adapter = embed_tokens
-
-        if self.adapter_type == "gated_league":
-            self.gate_linear = nn.Linear(2 * embed_dim, embed_dim)
-        elif self.adapter_type == "gated_league2":
-            self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
-            self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
-
-        if self.adapter_type == "shrink":
-            self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
-
-    def forward(self, x, padding):
-
-        representation, distribution = x
-        batch, seq_len, embed_dim = representation.size()
-        org_distribution = distribution
-        if distribution is not None:
-            distribution = distribution.view(-1, distribution.size(-1))
-        lengths = (~padding).long().sum(-1)
-
-        if self.adapter_type == "linear":
-            out = self.linear_adapter(representation)
-
-        elif self.adapter_type == "context":
-            out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-
-        elif self.adapter_type == "league":
-            linear_out = self.linear_adapter(representation)
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-            out = linear_out + soft_out
-
-        elif self.adapter_type == "gated_league":
-            linear_out = self.linear_adapter(representation)
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-            coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
-            out = coef * linear_out + (1 - coef) * soft_out
-
-        elif self.adapter_type == "none":
-            out = representation
-
-        elif self.adapter_type == "shrink":
-            from itertools import groupby
-
-            with torch.no_grad():
-                batch_predicted = []
-                prob_ctc = org_distribution.transpose(0, 1)  # T x B x D -> B x T x D
-                for b in range(prob_ctc.shape[0]):
-                    predicted = prob_ctc[b][: lengths[b]].argmax(-1).tolist()
-                    batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
-
-                new_lengths = [len(p) for p in batch_predicted]
-                weights_matrix = self.ctc_compress_method(prob_ctc, batch_predicted, new_lengths,
-                                                          representation.dtype, representation.device)
-
-            # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
-            compressed_output = representation.permute(1, 2, 0).bmm(weights_matrix)  # B x C x T'
-            out = compressed_output.permute(2, 0, 1)
-
-            out_lengths = lengths.new(new_lengths)
-            padding = lengths_to_padding_mask(out_lengths)
-
-        else:
-            out = None
-            logging.error("Unsupported adapter type: {}.".format(self.adapter_type))
-
-        return out, padding
+# class Adapter(nn.Module):
+#     def __init__(self, args, dictionary, embed_tokens):
+#         super().__init__()
+#
+#         embed_dim = args.encoder_embed_dim
+#
+#         self.adapter_type = args.adapter
+#         if self.adapter_type in ["linear", "league", "gated_league", "gated_league2"]:
+#             self.linear_adapter = nn.Sequential(
+#                 nn.Linear(embed_dim, embed_dim),
+#                 LayerNorm(args.encoder_embed_dim),
+#                 nn.ReLU(),
+#             )
+#         elif self.adapter_type == "linear2":
+#             self.linear_adapter = nn.Sequential(
+#                 nn.Linear(embed_dim, embed_dim),
+#             )
+#
+#         if self.adapter_type in ["embed", "context", "league", "gated_league", "gated_league2"]:
+#             if embed_tokens is None:
+#                 num_embeddings = len(dictionary)
+#                 self.embed_adapter = Embedding(num_embeddings, embed_dim, dictionary.pad())
+#             else:
+#                 self.embed_adapter = embed_tokens
+#
+#         if self.adapter_type == "gated_league":
+#             self.gate_linear = nn.Linear(2 * embed_dim, embed_dim)
+#         elif self.adapter_type == "gated_league2":
+#             self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
+#             self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
+#
+#         if self.adapter_type == "shrink":
+#             self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
+#
+#     def forward(self, x, padding):
+#
+#         representation, distribution = x
+#         batch, seq_len, embed_dim = representation.size()
+#         org_distribution = distribution
+#         if distribution is not None:
+#             distribution = distribution.view(-1, distribution.size(-1))
+#         lengths = (~padding).long().sum(-1)
+#
+#         if self.adapter_type == "linear":
+#             out = self.linear_adapter(representation)
+#
+#         elif self.adapter_type == "context":
+#             out = torch.mm(
+#                 distribution, self.embed_adapter.weight.float()
+#             ).view(batch, seq_len, -1).type_as(representation)
+#
+#         elif self.adapter_type == "league":
+#             linear_out = self.linear_adapter(representation)
+#             soft_out = torch.mm(
+#                 distribution, self.embed_adapter.weight.float()
+#             ).view(batch, seq_len, -1).type_as(linear_out)
+#             out = linear_out + soft_out
+#
+#         elif self.adapter_type == "gated_league":
+#             linear_out = self.linear_adapter(representation)
+#             soft_out = torch.mm(
+#                 distribution, self.embed_adapter.weight.float()
+#             ).view(batch, seq_len, -1).type_as(linear_out)
+#             coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
+#             out = coef * linear_out + (1 - coef) * soft_out
+#
+#         elif self.adapter_type == "none":
+#             out = representation
+#
+#         elif self.adapter_type == "shrink":
+#             from itertools import groupby
+#
+#             with torch.no_grad():
+#                 batch_predicted = []
+#                 prob_ctc = org_distribution.transpose(0, 1)  # T x B x D -> B x T x D
+#                 for b in range(prob_ctc.shape[0]):
+#                     predicted = prob_ctc[b][: lengths[b]].argmax(-1).tolist()
+#                     batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
+#
+#                 new_lengths = [len(p) for p in batch_predicted]
+#                 weights_matrix = self.ctc_compress_method(prob_ctc, batch_predicted, new_lengths,
+#                                                           prob_ctc.dtype, prob_ctc.device)
+#
+#             # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
+#             data_type = representation.dtype
+#             representation = representation.permute(1, 2, 0).float()
+#             compressed_output = representation.bmm(weights_matrix).type_as(data_type)  # B x C x T'
+#             out = compressed_output.permute(2, 0, 1)
+#
+#             out_lengths = lengths.new(new_lengths)
+#             padding = lengths_to_padding_mask(out_lengths)
+#
+#         else:
+#             out = None
+#             logging.error("Unsupported adapter type: {}.".format(self.adapter_type))
+#
+#         return out, padding


 class TextEncoder(FairseqEncoder):
@@ -301,7 +309,18 @@ class S2TSATEEncoder(FairseqEncoder):
         # adapter
         self.temperature = args.temperature

-        self.adapter = Adapter(args, task.source_dictionary, embed_tokens)
+        # self.adapter = Adapter(args, task.source_dictionary, embed_tokens)
+
+        strategy = None
+        if args.adapter == "shrink":
+            strategy = getattr(args, "ctc_compress_strategy", "avg")
+        elif args.adapter == "league":
+            strategy = getattr(args, "intermedia_distribution_cutoff", -1)
+
+        self.adapter = Adapter(args.encoder_embed_dim,
+                               args.adapter,
+                               task.source_dictionary,
+                               embed_tokens,
+                               strategy=strategy)

         if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):
             self.acoustic_encoder.ctc.ctc_projection.weight = self.adapter.embed_adapter.weight
@@ -332,7 +351,7 @@ class S2TSATEEncoder(FairseqEncoder):
         if "ctc_logit" in acoustic_encoder_out and len(acoustic_encoder_out["ctc_logit"]) > 0:
             ctc_logit = acoustic_encoder_out["ctc_logit"][0]
-            ctc_prob = F.softmax(ctc_logit / self.temperature, dim=-1)
+            ctc_prob = F.softmax(ctc_logit / self.temperature, dim=-1, dtype=torch.float32)
         else:
             ctc_logit = None
             ctc_prob = None
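The share_ctc_and_adapter branch kept above ties two modules to a single parameter: nn.Linear(dim, vocab) and nn.Embedding(vocab, dim) both store a vocab x dim weight, so assigning one Parameter to both keeps them synchronized during training. A minimal sketch of that tying, with made-up sizes:

import torch.nn as nn

vocab, dim = 100, 16
ctc_projection = nn.Linear(dim, vocab, bias=False)  # weight: vocab x dim
embed_adapter = nn.Embedding(vocab, dim)            # weight: vocab x dim

ctc_projection.weight = embed_adapter.weight  # one shared Parameter
assert ctc_projection.weight.data_ptr() == embed_adapter.weight.data_ptr()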
@@ -2,6 +2,7 @@ import logging
 import math
 from typing import Dict, List, Optional, Tuple

+import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -13,7 +14,7 @@ from fairseq.models import (
     register_model,
     register_model_architecture,
 )
-from fairseq.models.speech_to_text.modules import InterAdapter, CTC
+from fairseq.models.speech_to_text.modules import Adapter, CTC
 from fairseq.models.transformer import Embedding, TransformerDecoder
 from fairseq.modules import (
     FairseqDropout,
@@ -388,6 +389,12 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
             type=int,
             help="cutoff of the distribution",
         )
+        parser.add_argument(
+            "--intermedia-drop-prob",
+            default=0,
+            type=float,
+            help="probability of dropping the followed layers",
+        )
         pass

     @classmethod
@@ -494,6 +501,7 @@ class S2TTransformerEncoder(FairseqEncoder):
         self.padding_idx = 1

         self.subsample = subsampling(args)
+        self.linear = nn.Linear(dim, dim)

         self.attn_type = getattr(args, "encoder_attention_type", "selfattn")
@@ -573,8 +581,9 @@ class S2TTransformerEncoder(FairseqEncoder):
                 strategy = getattr(args, "ctc_compress_strategy", None)
             elif args.intermedia_adapter == "league":
                 strategy = getattr(args, "intermedia_distribution_cutoff", -1)
-            self.adapter = InterAdapter(dim, args.intermedia_adapter,
-                                        task.source_dictionary, strategy=strategy)
+            self.adapter = Adapter(dim, args.intermedia_adapter,
+                                   task.source_dictionary, strategy=strategy)
+            self.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

     @staticmethod
     def pooling_ratio():
@@ -631,6 +640,7 @@ class S2TTransformerEncoder(FairseqEncoder):
             x += positions
             positions = None

+        x = self.linear(x)
         x = self.dropout_module(x)

         # add emb into history
@@ -662,12 +672,17 @@ class S2TTransformerEncoder(FairseqEncoder):
             # interleave CTC
             if layer_idx in self.intermedia_ctc_layers:
+                if self.intermedia_drop_prob > 0:
+                    p = torch.rand(1).uniform_()
+                    if p < self.intermedia_drop_prob:
+                        break
+
                 norm_x = self.layer_norm(x)
                 logit = self.ctc(norm_x)
-                intermedia_ctc_logits.append(logit)
+                intermedia_ctc_logits.append(logit)

                 # prob = self.ctc.softmax(norm_x)
-                prob = F.softmax(logit, dim=-1)
+                prob = utils.softmax(logit, dim=-1)
                 x, encoder_padding_mask = self.adapter([x, prob], encoder_padding_mask)

         # gather cosine similarity
@@ -872,6 +887,7 @@ def base_architecture(args):
     # intermedia CTC
     args.intermedia_ctc_layers = getattr(args, "intermedia_ctc_layers", None)
     args.intermedia_adapter = getattr(args, "intermedia_adapter", None)
+    args.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

 @register_model_architecture("s2t_transformer", "s2t_transformer_s")
@@ -478,6 +478,7 @@ class TransformerEncoder(FairseqEncoder):
         self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

+        self.attn_type = getattr(args, "encoder_attention_type", "selfattn")
         if self.attn_type == "rel_pos":
             self.embed_positions = RelPositionalEncoding(
                 args.max_source_positions, args.encoder_embed_dim
@@ -121,7 +121,7 @@ class DynamicLinearCombination(nn.Module):
             self.weight_mask = self.weight_mask.cuda()
         if self.normalize_learned_weight:
             weight = self.weight.masked_fill((self.weight_mask == 0).unsqueeze(2), float('-inf'))
-            self.normalized_weight = F.softmax(weight, dim=1)
+            self.normalized_weight = F.softmax(weight, dim=1, dtype=torch.float32)
         return

     # following layer
@@ -9,6 +9,7 @@
 import math
 import torch
 from torch import nn
+import torch.nn.functional as F
 from fairseq.modules.rotary_positional_embedding import (
     RotaryPositionalEmbedding,
     apply_rotary_pos_emb,
@@ -73,10 +74,8 @@ class ESPNETMultiHeadedAttention(nn.Module):
                 mask.unsqueeze(1).unsqueeze(2).to(bool),
                 float("-inf"),  # (batch, head, time1, time2)
             )
-            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
-        else:
-            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+        self.attn = F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)  # (batch, head, time1, time2)
+        # self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

         p_attn = self.dropout(self.attn)
         x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
         x = (
@@ -91,7 +90,7 @@ class ESPNETMultiHeadedAttention(nn.Module):
             query (torch.Tensor): Query tensor T X B X C
             key (torch.Tensor): Key tensor T X B X C
             value (torch.Tensor): Value tensor T X B X C
-            mask (torch.Tensor): Mask tensor T X B
+            key_padding_mask (torch.Tensor): Mask tensor T X B
         Returns:
             torch.Tensor: Output tensor T X B X D.
         """
@@ -151,6 +151,7 @@ class MultiheadAttention(nn.Module):
         assert list(query.size()) == [tgt_len, bsz, embed_dim]

         if (
+            False and
             not self.onnx_trace
             and not is_tpu  # don't use PyTorch version on TPUs
             and incremental_state is None
@@ -349,9 +350,7 @@ class MultiheadAttention(nn.Module):
         if before_softmax:
             return attn_weights, v

-        attn_weights_float = utils.softmax(
-            attn_weights, dim=-1, onnx_trace=self.onnx_trace
-        )
+        attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32)
         attn_weights = attn_weights_float.type_as(attn_weights)
         attn_probs = self.dropout_module(attn_weights)
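The replacement above is the standard mixed-precision idiom: run the softmax in float32 for numerical range, then cast the result back with type_as so the following matmuls stay in the ambient (possibly fp16) dtype. In isolation:

import torch
import torch.nn.functional as F

scores = torch.randn(2, 4, dtype=torch.float16)
weights = F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)
assert weights.dtype == torch.float16  # downstream ops keep their fp16 speed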
...@@ -205,6 +205,7 @@ class RelativeMultiheadAttention(MultiheadAttention): ...@@ -205,6 +205,7 @@ class RelativeMultiheadAttention(MultiheadAttention):
# In this branch incremental_state is never None # In this branch incremental_state is never None
assert incremental_state is not None assert incremental_state is not None
incremental_state = self._set_input_buffer(incremental_state, saved_state) incremental_state = self._set_input_buffer(incremental_state, saved_state)
assert k is not None assert k is not None
src_len = k.size(1) src_len = k.size(1)
...@@ -271,7 +272,7 @@ class RelativeMultiheadAttention(MultiheadAttention): ...@@ -271,7 +272,7 @@ class RelativeMultiheadAttention(MultiheadAttention):
) )
else: else:
attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) attn_weights = attn_weights.float().masked_fill(key_padding_mask, float("-inf")).type_as(attn_weights)
attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
...@@ -292,7 +293,6 @@ class RelativeMultiheadAttention(MultiheadAttention): ...@@ -292,7 +293,6 @@ class RelativeMultiheadAttention(MultiheadAttention):
else: else:
attn = self._relative_attention_inner(attn_probs, v, relation_values, transpose=False) attn = self._relative_attention_inner(attn_probs, v, relation_values, transpose=False)
# attn = torch.bmm(attn_probs, v)
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
if self.onnx_trace and attn.size(1) == 1: if self.onnx_trace and attn.size(1) == 1:
# when ONNX tracing a single decoder step (sequence length == 1) # when ONNX tracing a single decoder step (sequence length == 1)
@@ -316,11 +316,11 @@ class RelativeMultiheadAttention(MultiheadAttention):
     def _generate_relative_positions_matrix(length, max_relative_length, device, incremental_state):
         if not incremental_state:
             # training process
-            range_vec = torch.arange(length).to(device)
+            range_vec = torch.arange(length, device=device)
             range_mat = range_vec.repeat(length, 1)
             distance_mat = range_mat - range_mat.transpose(0, 1)
         else:
-            distance_mat = torch.arange(-length + 1, 1).view(1, -1).to(device)
+            distance_mat = torch.arange(-length + 1, 0, device=device).view(1, -1)
         distance_mat_clipped = torch.clamp(distance_mat, -max_relative_length, max_relative_length)
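For reference, the training branch builds an all-pairs distance matrix and clamps it to the relative window; a compact equivalent of that computation (illustrative only, not the repository's code):

import torch

def relative_positions(length, max_relative_length, device=None):
    range_vec = torch.arange(length, device=device)
    # distance[i, j] = j - i, clamped to [-max_relative_length, max_relative_length]
    distance = range_vec.unsqueeze(0) - range_vec.unsqueeze(1)
    return torch.clamp(distance, -max_relative_length, max_relative_length)

print(relative_positions(4, 2)[0])  # tensor([0, 1, 2, 2])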
@@ -337,7 +337,7 @@ class RelativeMultiheadAttention(MultiheadAttention):
         Args:
             x: Tensor with shape [batch_size*heads, length, length or depth].
-            y: Tensor with shap e [batch_size*heads, length, depth].
+            y: Tensor with shape [batch_size*heads, length, depth].
             z: Tensor with shape [length, length, depth].
             transpose: Whether to transpose inner matrices of y and z. Should be true if
                 last dimension of x is depth, not length.
...
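The docstring (typo fixed above) describes Shaw-style relative attention: a content term `x @ y` plus a positional term against the per-pair embeddings `z`. A minimal reimplementation of that contract under the stated shapes, offered as a sketch rather than the repository's exact method:

import torch

def relative_attention_inner(x, y, z, transpose):
    # Content term: batched matmul over the batch*heads dimension.
    xy = torch.matmul(x, y.transpose(-2, -1) if transpose else y)
    # Positional term: fold batch into the length dimension so each query
    # position multiplies its own [length, depth] slice of z.
    x_t = x.permute(1, 0, 2)
    xz = torch.matmul(x_t, z.transpose(-2, -1) if transpose else z)
    return xy + xz.permute(1, 0, 2)

q = torch.randn(8, 10, 64)        # [batch*heads, length, depth]
k = torch.randn(8, 10, 64)
rel_k = torch.randn(10, 10, 64)   # [length, length, depth]
print(relative_attention_inner(q, k, rel_k, transpose=True).shape)  # [8, 10, 10]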
@@ -161,7 +161,13 @@ class S2TTransformerEncoderLayer(nn.Module):
             else:
                 print("The maximum encoder relative length %d can not be -1!" % max_relative_length)
                 exit(1)
-        elif self.attn_type in ["rel_pos", "rel_pos_legacy"]:
+        elif self.attn_type == "rel_pos":
+            return RelPositionMultiHeadedAttention(
+                embed_dim,
+                attention_heads,
+                dropout=dropout,
+            )
+        elif self.attn_type == "rel_pos_legacy":
             return LegacyRelPositionMultiHeadedAttention(
                 embed_dim,
                 attention_heads,
@@ -236,8 +242,8 @@ class S2TTransformerEncoderLayer(nn.Module):
         # Note that we cannot use -inf here, because at some edge cases,
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
-        # if attn_mask is not None:
-        #     attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+        if attn_mask is not None:
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)

         # whether to use macaron style
         if self.macaron_norm is not None:
...
@@ -142,9 +142,6 @@ class Conv1dSubsampling(nn.Module):
                 get_activation_class(act, dim=1)
             ) for layer_id in range(num_layers)])

-        out_dim = filters[-1]
-        self.linear = nn.Linear(out_dim, out_dim)
-
     def forward(self, x, x_len):
         # (B, T, D) -> (B, D, T)
@@ -157,7 +154,6 @@ class Conv1dSubsampling(nn.Module):
             if x_len is not None:
                 x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1
         x = x.transpose(1, 2)
-        x = self.linear(x)

         return x, x_len
...
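With the unused `nn.Linear` gone, the module reduces to the stacked strided convolutions, and the length update shown in the diff shrinks the sequence accordingly. A small sketch of how output lengths evolve, assuming the update is applied once per layer and two stride-2 layers are used:

import torch

def subsampled_lengths(x_len, num_layers=2, stride=2):
    # Each strided conv shortens the sequence: L' = floor((L - 1) / stride) + 1
    for _ in range(num_layers):
        x_len = torch.div(x_len - 1, stride, rounding_mode='floor') + 1
    return x_len

print(subsampled_lengths(torch.tensor([100, 37])))  # tensor([25, 10])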
@@ -169,7 +169,7 @@ class TransformerEncoderLayer(nn.Module):
                 `attn_mask[tgt_i, src_j] = 1` means that when calculating the
                 embedding for `tgt_i`, we exclude (mask out) `src_j`. This is
                 useful for strided self-attention.
-            positions (Tensor): the position embedding for relative position encoding
+            pos_emb (Tensor): the position embedding for relative position encoding

         Returns:
             encoded output of shape `(seq_len, batch, embed_dim)`
@@ -180,7 +180,9 @@ class TransformerEncoderLayer(nn.Module):
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
-            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+            attn_mask = attn_mask.masked_fill(
+                attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4
+            )

         residual = x
         if self.normalize_before:
...
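The fill value is now dtype-aware: -1e8 exceeds fp16's largest finite value (about 65504), so under half precision it would round to -inf and reintroduce exactly the NaN risk the comment warns about, while -1e4 stays finite in fp16 and still drives masked logits to zero probability after softmax. A standalone sketch:

import torch

def fill_attn_mask(attn_mask, x):
    # Choose the largest safe negative constant for the activation dtype.
    fill_val = -1e8 if x.dtype == torch.float32 else -1e4
    return attn_mask.masked_fill(attn_mask.to(torch.bool), fill_val)

x = torch.randn(4, 4).half()
causal = torch.triu(torch.ones(4, 4), diagonal=1)  # 1 above the diagonal
print(fill_attn_mask(causal, x))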