Commit 478c694b by xuchen


parent e248f2f0
#!/usr/bin/env bash
gpu_num=1
gpu_num=0
data_dir=
test_subset=(dev tst-COMMON)
@@ -15,12 +15,12 @@ ctc_infer=0
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
max_tokens=50000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 3
--stop_stage 3
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
......
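
The script above is a thin decoding wrapper: it fixes the decoding hyper-parameters and forwards them to run.sh, whose decoding stage moves from 3 to 2 in this commit. A minimal sketch of the pattern, using only variables that appear in the diff (the full option list is abridged):

#!/usr/bin/env bash
gpu_num=0
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=50000
dec_model=checkpoint_best.pt

# Assemble the run.sh invocation; stage 2 is decoding after the renumbering.
cmd="./run.sh
    --stage 2
    --stop_stage 2
    --gpu_num ${gpu_num}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}
    --max_tokens ${max_tokens}
    --dec_model ${dec_model}"
eval ${cmd}
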
@@ -2,8 +2,7 @@
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode: it will exit on
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
@@ -16,22 +15,21 @@ eval=1
time=$(date "+%m%d_%H%M")
stage=1
stop_stage=4
stop_stage=2
######## hardware ########
# devices
######## Hardware ########
# Devices
device=(0)
gpu_num=8
update_freq=1
hdfs_get=0
root_dir=/opt/tiger
data_root_dir=/mnt/bn/nas-xc-1
code_dir=${root_dir}/s2t
pwd_dir=$PWD
root_dir=${ST_ROOT}
data_root_dir=${root_dir}
code_dir=${root_dir}/S2T
# dataset
# Dataset
src_lang=en
tgt_lang=de
dataset=must_c
@@ -63,24 +61,22 @@ valid_split=dev
test_split=tst-COMMON
test_subset=dev,tst-COMMON
# exp
# Exp
sub_tag=
exp_prefix=$(date "+%m%d")
# exp_subfix=${ARNOLD_JOB_ID}_${ARNOLD_TASK_ID}_${ARNOLD_TRIAL_ID}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
# Training Settings
train_config=base,ctc
data_config=config.yaml
# training setting
fp16=1
max_tokens=40000
step_valid=0
data_config=config.yaml
# decoding setting
# Decoding Settings
cer=0
ctc_infer=0
ctc_self_ensemble=0
@@ -92,6 +88,7 @@ len_penalty=1.0
infer_score=0
infer_parameters=
# Parsing Options
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
@@ -124,19 +121,6 @@ if [[ ! -d ${data_dir} ]]; then
exit
fi
# setup nccl envs
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=$ARNOLD_RDMA_DEVICE:1
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
HOSTS=$ARNOLD_WORKER_HOSTS
HOST=(${HOSTS//,/ })
HOST_SPLIT=(${HOST//:/ })
PORT=${HOST_SPLIT[1]}
INIT_METHOD="tcp://${ARNOLD_WORKER_0_HOST}:${ARNOLD_WORKER_0_PORT}"
DIST_RANK=$((ARNOLD_ID * ARNOLD_WORKER_GPU))
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
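
local/parse_options.sh is the Kaldi-style helper that lets any --name value pair on the command line override a variable the script has already declared with a default; this is how the decode wrapper's flags reach these scripts. A simplified sketch of the mechanism (the real helper additionally validates option names and prints usage, so this is not the shipped implementation):

# Map --name value pairs onto existing shell variables.
while [[ $# -gt 0 ]]; do
    case "$1" in
    --*)
        name=${1#--}           # strip leading dashes
        name=${name//-/_}      # e.g. --stop-stage -> stop_stage
        eval "${name}=\"$2\""  # override the default set earlier
        shift 2 ;;
    *) break ;;
    esac
done
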
@@ -150,21 +134,27 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
model_dir=${code_dir}/checkpoints/${data_model_subfix}/${exp_name}
echo "stage: $stage"
echo "stop_stage: $stop_stage"
ckpt_dir=${root_dir}/checkpoints/
model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
# Start
cd ${code_dir}
echo "Start Stage: $stage"
echo "Stop Stage: $stop_stage"
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "Default Stage: env configure"
pip3 install -e ${code_dir}
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
echo "Stage -1: Data Download"
fi
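
Every step in these scripts is guarded by the same gate: a stage body runs only when its number falls inside the [stage, stop_stage] window, so one script can resume from any point. For example, with stage=1 and stop_stage=2, data preparation is skipped and only training and decoding run (run_stage is a hypothetical helper; the scripts inline the test):

stage=1
stop_stage=2
run_stage() {
    local s=$1
    [ ${stage} -le ${s} ] && [ ${stop_stage} -ge ${s} ]
}
run_stage 0 && echo "Stage 0: Data Preparation"   # skipped
run_stage 1 && echo "Stage 1: Network Training"   # runs
run_stage 2 && echo "Stage 2: Decoding"           # runs
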
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases.
echo "stage 0: ASR Data Preparation"
echo "Stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
@@ -205,32 +195,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
[[ $eval -eq 1 ]] && eval ${cmd}
fi
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "default stage: env configure"
pip3 install -e ${code_dir} -i https://bytedpypi.byted.org/simple --no-build-isolation --default-timeout=10000
fi
if [[ -d /mnt/bn/nas-xc-1/checkpoints && ! -d ${code_dir}/checkpoints ]]; then
ln -s /mnt/bn/nas-xc-1/checkpoints ${code_dir}
fi
# if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [ ${hdfs_get} -eq 1 ]; then
ln_data_dir=`echo ${data_dir} | sed -e "s#${data_root_dir}#${code_dir}#"`
echo ${ln_data_dir}
mkdir -p ${ln_data_dir}
ln -s ${data_dir}/../* ${ln_data_dir}
rm -r ${ln_data_dir}
hdfs_path=`echo ${data_dir} | sed -e "s#${data_root_dir}#hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/#"`
hdfs dfs -get ${hdfs_path} ${ln_data_dir}
sed -i -e "s#${data_root_dir}#${code_dir}#" ${ln_data_dir}/config*
data_dir=${ln_data_dir}
fi
# fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ASR Network Training"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Stage 1: Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
@@ -240,6 +206,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
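
When no device list is preset, the scripts ask local/utils.sh for idle GPUs and pin the job through CUDA_VISIBLE_DEVICES. The real get_devices is not part of this diff; a hypothetical stand-in that picks the first $gpu_num GPUs with little memory in use might look like this (the second argument of the real helper is not documented here and is ignored):

get_devices() {
    local need=$1 found=0 ids=""
    # List "index, memory.used" for each GPU and take lightly loaded ones.
    while IFS=', ' read -r id mem; do
        if [[ ${mem} -lt 100 && ${found} -lt ${need} ]]; then
            ids="${ids:+${ids},}${id}"
            found=$((found + 1))
        fi
    done < <(nvidia-smi --query-gpu=index,memory.used --format=csv,noheader,nounits)
    echo "${ids}"
}

device=$(get_devices ${gpu_num} 0)
export CUDA_VISIBLE_DEVICES=${device}
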
echo -e "data=${data_dir} model=${model_dir}"
@@ -327,22 +294,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
log=${ckpt_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
# export CUDA_VISIBLE_DEVICES=${device}
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="nohup ${cmd} >> ${log} 2>&1 &"
if [[ $eval -eq 1 ]]; then
# tensorboard
if [[ -z ${ARNOLD_TENSORBOARD_CURRENT_PORT} ]]; then
port=6666
else
port=${ARNOLD_TENSORBOARD_CURRENT_PORT}
fi
port=6666
tensorboard --logdir ${model_dir} --port ${port} --bind_all &
echo "${cmd}" > ${model_dir}/cmd
@@ -352,8 +314,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
fi
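
The training stage keeps two logs: a rolling history.log under the checkpoint root (trimmed to its last 50 lines) and a per-experiment train.log fed by tee, with a TensorBoard instance backgrounded on the model directory before fairseq starts. Condensed from the lines above, nothing added except the final eval guard:

port=6666
tensorboard --logdir ${model_dir} --port ${port} --bind_all &

log=${model_dir}/train.log
# Stream training output to the console while appending it to train.log.
cmd="${cmd} 2>&1 | tee -a ${log}"
[[ $eval -eq 1 ]] && eval ${cmd}
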
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: ASR Decoding"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Stage 2: Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
@@ -377,18 +339,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
# export CUDA_VISIBLE_DEVICES=${device}
suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
if [[ ${n_average} -ne 1 ]]; then
suffix=${suffix}_${n_average}
fi
if [[ -n ${cer} && ${cer} -eq 1 ]]; then
suffix=${suffix}_cer
else
suffix=${suffix}_wer
fi
if [[ ${n_average} -ne 1 ]]; then
suffix=${suffix}_${n_average}
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
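
When n_average is not 1, decoding runs on an averaged model rather than checkpoint_best.pt. The averaging call itself is elided from this diff; with fairseq it is conventionally done via scripts/average_checkpoints.py, roughly as below (the script path is the stock fairseq location and the flag choice depends on how checkpoints were saved, so treat this as an assumption):

dec_model=avg_${n_average}_checkpoint.pt
# Average the last n checkpoints into a single decoding model.
python3 ${code_dir}/scripts/average_checkpoints.py \
    --inputs ${model_dir} \
    --num-epoch-checkpoints ${n_average} \
    --output ${model_dir}/${dec_model}
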
@@ -435,9 +397,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
cd ${code_dir}
if [[ $eval -eq 1 ]]; then
src_ctc_file=translation-${subset}.txt.ctc
if [[ -f ${model_dir}/${src_ctc_file} ]]; then
rm ${model_dir}/${src_ctc_file}
ctc_file=translation-${subset}.ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
rm ${model_dir}/${ctc_file}
fi
eval $cmd
@@ -448,33 +410,34 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
cd ${pwd_dir}
if [[ -f ${model_dir}/enc_dump ]]; then
mv ${model_dir}/enc_dump ${model_dir}/${subset}-${suffix}-enc-dump
mv ${model_dir}/enc_dump ${model_dir}/dump-${subset}-enc-${suffix}
fi
if [[ -f ${model_dir}/dec_dump ]]; then
mv ${model_dir}/dec_dump ${model_dir}/${subset}-${suffix}-dec-dump
mv ${model_dir}/dec_dump ${model_dir}/dump-${subset}-dec-${suffix}
fi
trans_file=translation-${subset}-${suffix}.txt
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${src_ctc_file} ]]; then
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${src_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "src_text"
fi
if [[ -f ${ref_file} ]]; then
src_ctc=$(mktemp -t temp.record.XXXXXX)
ctc=$(mktemp -t temp.record.XXXXXX)
cd ./local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${src_ctc_file} ${ref_file} > ${src_ctc}
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} > ${ctc}
cd ..
echo "CTC WER" >> ${result_file}
tail -n 2 ${src_ctc} >> ${result_file}
tail -n 2 ${ctc} >> ${result_file}
src_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${src_ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
cd ..
cat ${src_bleu} >> ${result_file}
rm ${src_ctc} ${src_bleu}
rm ${ctc} ${src_bleu}
else
echo "No reference for source language."
fi
@@ -484,11 +447,3 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo
cat ${result_file}
fi
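
The scoring block above follows a consistent temp-file pattern: each metric script writes to a mktemp scratch file, only the relevant tail is appended to the per-run result file, and the scratch files are removed, so a failing metric never leaves partial lines in the results. Condensed for the CTC WER case, with helper names as they appear in the diff:

ctc=$(mktemp -t temp.record.XXXXXX)
# Score the CTC output against the source-language reference.
(cd ./local && ./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file}) > ${ctc}
echo "CTC WER" >> ${result_file}
tail -n 2 ${ctc} >> ${result_file}
rm ${ctc}
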
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# cd ${fairseq_dir}
# echo "Stage 4: Upload model and log"
# echo "Path: hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}/${exp_name}"
# hdfs dfs -mkdir -p hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
# hdfs dfs -put -f ${model_dir} hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
# fi
@@ -2,7 +2,7 @@
# Processing MuST-C Datasets
# Copyright 2021 Chen Xu (xuchenneu@163.com)
# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode: it will exit on
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
@@ -17,19 +17,19 @@ time=$(date "+%m%d_%H%M")
stage=1
stop_stage=2
######## hardware ########
# devices
######## Hardware ########
# Devices
device=(0)
gpu_num=8
update_freq=1
pwd_dir=$PWD
root_dir=${pwd_dir}/../../../..
data_root_dir=${root_dir}/data
root_dir=${pwd_dir}/../../../../
data_root_dir=${root_dir}
code_dir=${root_dir}/S2T
# dataset
# Dataset
src_lang=en
tgt_lang=de
dataset=must_c
@@ -63,7 +63,7 @@ valid_split=dev
test_split=tst-COMMON
test_subset=dev,tst-COMMON
# exp
# Exp
sub_tag=
exp_prefix=$(date "+%m%d")
extra_tag=
@@ -71,16 +71,14 @@ extra_parameter=
exp_tag=baseline
exp_name=
# config
# Training Settings
train_config=base,ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
# Decoding Settings
sacrebleu=1
dec_model=checkpoint_best.pt
ctc_infer=0
@@ -90,6 +88,7 @@ len_penalty=1.0
infer_score=0
infer_parameters=
# Parsing Options
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
@@ -136,12 +135,14 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
ckpt_dir=${code_dir}/checkpoints/
model_dir=${code_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
# Start
cd ${code_dir}
echo "Start Stage: $stage"
echo "Stop Stage: $stop_stage"
cd ${code_dir}
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "Default Stage: env configure"
@@ -150,12 +151,10 @@ fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part by yourself.
### But you can utilize Kaldi recipes in most cases.
echo "Stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
@@ -255,6 +254,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
echo -e "data=${data_dir} model=${model_dir}"
@@ -308,11 +308,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
if [[ ${DIST_RANK} -ne 0 ]]; then
cmd="${cmd}
--distributed-init-method ${INIT_METHOD}
--distributed-rank ${DIST_RANK}"
fi
fi
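
With this commit, multi-GPU training keeps only fairseq's generic distributed flags and drops the cluster-specific rank/init-method wiring built from the ARNOLD_* variables, so the script no longer assumes that scheduler. The surviving form, with the multi-GPU guard assumed from the elided context:

# Guard condition assumed; only the flags below appear in the diff.
if [[ ${gpu_num} -gt 1 ]]; then
    cmd="${cmd}
        --distributed-world-size ${gpu_num}
        --ddp-backend no_c10d"
fi
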
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
@@ -362,7 +357,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
# export CUDA_VISIBLE_DEVICES=${device}
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
@@ -404,8 +398,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
# export CUDA_VISIBLE_DEVICES=${device}
suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
if [[ ${n_average} -ne 1 ]]; then
......