acc update

d88a22ef · xuchen · 478c694b · d88a22ef · d88a22ef · d88a22ef
Commit d88a22ef authored Aug 10, 2023 by xuchen
--- a/egs/mustc/asr/run.sh
+++ b/egs/mustc/asr/run.sh
@@ -85,8 +85,8 @@ dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
 len_penalty=1.0
-infer_score=0
-infer_parameters=
+infer_score=1
+infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"

 # Parsing Options
 if [[ ${speed_perturb} -eq 1 ]]; then
@@ -370,6 +370,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --max-tokens ${max_tokens}
        --beam ${beam_size}
        --lenpen ${len_penalty}
+        --batch-size 1
        --scoring wer
        --wer-tokenizer 13a
        --wer-lowercase

--- a/egs/mustc/mt/decode.sh
+++ b/egs/mustc/mt/decode.sh
@@ -21,12 +21,12 @@ sacrebleu=1
 n_average=10
 beam_size=5
 len_penalty=1.0
-max_tokens=40000
+max_tokens=50000
 dec_model=checkpoint_best.pt

 cmd="./run.sh
-    --stage 3
-    --stop_stage 3
+    --stage 2
+    --stop_stage 2
    --src_lang ${src_lang}
    --tgt_lang ${tgt_lang}
    --share_dict ${share_dict}

--- a/egs/mustc/mt/run.sh
+++ b/egs/mustc/mt/run.sh
@@ -2,8 +2,7 @@

 # Processing MuST-C Datasets

-# Copyright 2021 Natural Language Processing Laboratory 
-# Xu Chen (xuchenneu@163.com)
+# Copyright 2021 Chen Xu (xuchennlp@outlook.com)

 # Set bash to 'debug' mode, it will exit on :
 # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
@@ -16,22 +15,21 @@ eval=1
 time=$(date "+%m%d_%H%M")

 stage=1
-stop_stage=4
+stop_stage=2

-######## hardware ########
-# devices
+######## Hardware ########
+# Devices
 device=(0)
 gpu_num=1
 update_freq=1
-hdfs_get=0

-root_dir=/opt/tiger
-data_root_dir=/mnt/bn/nas-xc-1
-
-code_dir=${root_dir}/s2t
 pwd_dir=$PWD
+root_dir=${ST_ROOT}
+data_root_dir=${root_dir}
+
+code_dir=${root_dir}/S2T

-# dataset
+# Dataset
 src_lang=en
 tgt_lang=de
 dataset=must_c
@@ -65,18 +63,16 @@ valid_subset=dev
 trans_subset=tst-COMMON
 test_subset=valid,test

-# exp
+# Exp
+sub_tag=
 exp_prefix=$(date "+%m%d")
-# exp_subfix=${ARNOLD_JOB_ID}_${ARNOLD_TASK_ID}_${ARNOLD_TRIAL_ID}
 extra_tag=
 extra_parameter=
 exp_tag=baseline
 exp_name=

-# config
+# Training Settings
 train_config=small
-
-# training setting
 fp16=1
 max_tokens=8192
 step_valid=0
@@ -88,7 +84,10 @@ dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
 len_penalty=1.0
+infer_score=1
+infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"

+# Parsing Options
 . ./local/parse_options.sh || exit 1;

 if [[ ${use_specific_dict} -eq 1 ]]; then
@@ -127,19 +126,6 @@ if [[ ${tokenizer} -eq 1 ]]; then
    exp_prefix=${exp_prefix}_tok
 fi

-# setup nccl envs
-export NCCL_IB_DISABLE=0
-export NCCL_IB_HCA=$ARNOLD_RDMA_DEVICE:1
-export NCCL_IB_GID_INDEX=3
-export NCCL_SOCKET_IFNAME=eth0
-
-HOSTS=$ARNOLD_WORKER_HOSTS
-HOST=(${HOSTS//,/ })
-HOST_SPLIT=(${HOST//:/ })
-PORT=${HOST_SPLIT[1]}
-INIT_METHOD="tcp://${ARNOLD_WORKER_0_HOST}:${ARNOLD_WORKER_0_PORT}"
-DIST_RANK=$((ARNOLD_ID * ARNOLD_WORKER_GPU))
-
 export PATH=$PATH:${code_dir}/scripts
 . ./local/parse_options.sh || exit 1;

@@ -153,20 +139,27 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${exp_subfix}
    fi
 fi
-model_dir=${code_dir}/checkpoints/${data_model_subfix}/${exp_name}

-echo "stage: $stage"
-echo "stop_stage: $stop_stage"
+ckpt_dir=${root_dir}/checkpoints/
+model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
+
+# Start
 cd ${code_dir}
+echo "Start Stage: $stage"
+echo "Stop  Stage: $stop_stage"
+
+if [[ $(pip3 list | grep -c fairseq) -eq 0 ]]; then
+    echo "Default Stage: env configure"
+    pip3 install -e ${code_dir}
+fi

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    echo "stage -1: Data Download"
-    # pass
+    echo "Stage -1: Data Download"
 fi

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make data the following preparation part by yourself.
-    echo "stage 0: MT Data Preparation"
+    echo "Stage 0: Data Preparation"
    if [[ ! -e ${data_dir} ]]; then
        mkdir -p ${data_dir}
    fi
@@ -230,32 +223,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    [[ $eval -eq 1 ]] && eval ${cmd}
 fi

-echo "stage 1: env configure"
-if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then 
-    pip3 install -e ${code_dir} -i https://bytedpypi.byted.org/simple  --no-build-isolation --default-timeout=10000
-fi
-
-if [[ -d /mnt/bn/nas-xc-1/checkpoints && ! -d ${code_dir}/checkpoints ]]; then
-    ln -s /mnt/bn/nas-xc-1/checkpoints ${code_dir}
-fi
-# if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-
-    if [ ${hdfs_get} -eq 1 ]; then
-        ln_data_dir=`echo ${data_dir} | sed -e "s#${data_root_dir}#${code_dir}#"`
-        echo ${ln_data_dir}
-        mkdir -p ${ln_data_dir}
-        ln -s ${data_dir}/../* ${ln_data_dir}
-        rm -r ${ln_data_dir}
-
-        hdfs_path=`echo ${data_dir} | sed -e "s#${data_root_dir}#hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/#"`
-        hdfs dfs -get ${hdfs_path} ${ln_data_dir}
-        data_dir=${ln_data_dir}
-    fi
-# fi
 data_dir=${data_dir}/data-bin
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    echo "stage 2: MT Network Training"
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "Stage 1: Network Training"
    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;

    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
@@ -265,6 +235,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
 		fi
+        export CUDA_VISIBLE_DEVICES=${device}
    fi

    echo -e "data=${data_dir} model=${model_dir}"
@@ -369,18 +340,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
    tail -n 50 ${log} > tmp.log
    mv tmp.log $log
-    # export CUDA_VISIBLE_DEVICES=${device}

    log=${model_dir}/train.log
    cmd="${cmd} 2>&1 | tee -a ${log}"
    #cmd="nohup ${cmd} >> ${log} 2>&1 &"
    if [[ $eval -eq 1 ]]; then
        # tensorboard
-        if [[ -z ${ARNOLD_TENSORBOARD_CURRENT_PORT} ]]; then
        port=6666
-        else
-            port=${ARNOLD_TENSORBOARD_CURRENT_PORT}
-        fi
        tensorboard --logdir ${model_dir} --port ${port} --bind_all &
    
        echo "${cmd}" > ${model_dir}/cmd
@@ -390,8 +356,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 	fi
 fi

-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    echo "stage 3: MT Decoding"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "Stage 2: Decoding"
    if [[ ${n_average} -ne 1 ]]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt
@@ -415,8 +381,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
 		fi
+        export CUDA_VISIBLE_DEVICES=${device}
    fi
-    # export CUDA_VISIBLE_DEVICES=${device}

    suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
    if [[ ${n_average} -ne 1 ]]; then
@@ -427,6 +393,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    else
        suffix=${suffix}_multibleu
    fi
+    if [[ ${infer_score} -eq 1 ]]; then
+        suffix=${suffix}_score
+    fi
 	result_file=${model_dir}/decode_result_${suffix}
 	[[ -f ${result_file} ]] && rm ${result_file}

@@ -442,6 +411,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --results-path ${model_dir}
        --max-tokens ${max_tokens}
        --beam ${beam_size}
+        --batch-size 1
        --lenpen ${len_penalty}
        --post-process sentencepiece"

@@ -462,27 +432,34 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --target-lang ${tgt_lang}"
            fi
        fi
+        if [[ ${infer_score} -eq 1 ]]; then
+            cmd="${cmd}
+        --score-reference"
+        fi
+        if [[ -n ${infer_parameters} ]]; then
+            cmd="${cmd}
+        ${infer_parameters}"
+        fi

        echo -e "\033[34mRun command: \n${cmd} \033[0m"

        cd ${code_dir}
        if [[ $eval -eq 1 ]]; then
            eval $cmd
+            echo "" >> ${result_file}
            tail -n 2 ${model_dir}/generate-${subset}.txt >> ${result_file}
            mv ${model_dir}/generate-${subset}.txt ${model_dir}/generate-${subset}-${suffix}.txt
            mv ${model_dir}/translation-${subset}.txt ${model_dir}/translation-${subset}-${suffix}.txt

            cd ${pwd_dir}
+            if [[ -f ${model_dir}/enc_dump ]]; then
+                mv ${model_dir}/enc_dump ${model_dir}/dump-${subset}-enc-${suffix}
+            fi
+            if [[ -f ${model_dir}/dec_dump ]]; then
+                mv ${model_dir}/dec_dump ${model_dir}/dump-${subset}-dec-${suffix}
+            fi
        fi
 	done
 	echo
    cat ${result_file}
 fi
-
-# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-#     cd ${fairseq_dir}
-#     echo "Stage 4: Upload model and log"
-#     echo "Path: hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}/${exp_name}"
-#     hdfs dfs -mkdir -p hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
-#     hdfs dfs -put -f ${model_dir} hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
-# fi
--- a/egs/mustc/st/decode.sh
+++ b/egs/mustc/st/decode.sh
@@ -3,7 +3,7 @@
 gpu_num=1

 data_dir=
-test_subset=(dev tst-COMMON)
+test_subset=(tst-COMMON)

 exp_name=
 if [ "$#" -eq 1 ]; then
@@ -11,16 +11,16 @@ if [ "$#" -eq 1 ]; then
 fi

 sacrebleu=1
-ctc_infer=1
+ctc_infer=0
 n_average=10
 beam_size=5
 len_penalty=1.0
-max_tokens=80000
+max_tokens=50000
 dec_model=checkpoint_best.pt

 cmd="./run.sh
-    --stage 3
-    --stop_stage 3
+    --stage 2
+    --stop_stage 2
    --gpu_num ${gpu_num}
    --exp_name ${exp_name}
    --sacrebleu ${sacrebleu}

--- a/egs/mustc/st/run.sh
+++ b/egs/mustc/st/run.sh
@@ -24,7 +24,7 @@ gpu_num=8
 update_freq=1

 pwd_dir=$PWD
-root_dir=${pwd_dir}/../../../../
+root_dir=${ST_ROOT}
 data_root_dir=${root_dir}

 code_dir=${root_dir}/S2T
@@ -85,8 +85,9 @@ ctc_infer=0
 n_average=10
 beam_size=5
 len_penalty=1.0
-infer_score=0
-infer_parameters=
+infer_score=1
+infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
+

 # Parsing Options
 if [[ ${share_dict} -eq 1 ]]; then
@@ -136,8 +137,8 @@ if [[ -z ${exp_name} ]]; then
    fi
 fi

-ckpt_dir=${code_dir}/checkpoints/
-model_dir=${code_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
+ckpt_dir=${root_dir}/checkpoints/
+model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}

 # Start
 cd ${code_dir}
@@ -427,6 +428,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --path ${model_dir}/${dec_model}
        --results-path ${model_dir}
        --max-tokens ${max_tokens}
+        --batch-size 1
        --beam ${beam_size}
        --skip-invalid-size-inputs-valid-test
        --lenpen ${len_penalty}"

--- a/fairseq/data/indexed_dataset.py
+++ b/fairseq/data/indexed_dataset.py
@@ -109,8 +109,8 @@ _code_to_dtype = {
    3: np.int16,
    4: np.int32,
    5: np.int64,
-    6: np.float,
-    7: np.double,
+    6: np.float32,
+    7: np.float64,
    8: np.uint16,
    9: np.uint32,
    10: np.uint64,
@@ -316,8 +316,8 @@ class IndexedDatasetBuilder:
        np.int16: 2,
        np.int32: 4,
        np.int64: 8,
-        np.float: 4,
-        np.double: 8,
+        np.float32: 4,
+        np.float64: 8,
    }

    def __init__(self, out_file, dtype=np.int32):

--- a/fairseq/models/transformer.py
+++ b/fairseq/models/transformer.py
@@ -563,6 +563,17 @@ class TransformerEncoder(FairseqEncoder):
            x = self.quant_noise(x)
        return x, embed

+    def set_flag(self, **kwargs):
+        # Pass the flags on to each layer that exposes its own set_flag().
+        for module in filter(lambda m: hasattr(m, "set_flag"), self.layers):
+            module.set_flag(**kwargs)
+
+    def dump(self, fstream, info=""):
+        """Ask each layer that defines dump() to write to fstream, labelled "<info> Layer <i>"."""
+        for idx, layer in enumerate(self.layers):
+            if hasattr(layer, "dump"):
+                layer.dump(fstream, "%s Layer %d" % (info, idx))
+
    def forward(
        self,
        src_tokens,

--- a/fairseq/modules/multihead_attention.py
+++ b/fairseq/modules/multihead_attention.py
@@ -113,9 +113,9 @@ class MultiheadAttention(nn.Module):
        if kwargs.get("cal_localness", False) and not self.encoder_decoder_attention: 
            self.cal_localness = True
            self.localness_window = kwargs.get("localness_window", 0.1)
-        if kwargs.get("cal_entropy", False) and self.encoder_decoder_attention: 
+        if kwargs.get("cal_entropy", False): # and self.encoder_decoder_attention: 
            self.cal_entropy = True
-        if kwargs.get("cal_topk_cross_attn_weights", False) and self.encoder_decoder_attention: 
+        if kwargs.get("cal_topk_cross_attn_weights", False):
            self.cal_topk = True
            self.weights_topk = kwargs.get("topk_cross_attn_weights", 1)
        if kwargs.get("cal_monotonic_cross_attn_weights", False) and self.encoder_decoder_attention: 
@@ -123,7 +123,7 @@ class MultiheadAttention(nn.Module):

    def dump(self, fstream, info):
        if self.cal_localness:
-            print("%s window size: %f localness: %.2f" % (info, self.localness_window, self.localness), file=fstream)
+            print("%s window size: %.2f localness: %.4f" % (info, self.localness_window, self.localness), file=fstream)
        
        if self.cal_entropy:
            print("%s Entropy: %.2f" % (info, self.entropy), file=fstream)
@@ -423,36 +423,55 @@ class MultiheadAttention(nn.Module):
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

-        self.cal_localness_func(attn_weights_float, bsz, src_len, tgt_len)
+        self.cal_localness_func(attn_weights_float, bsz, src_len, tgt_len, key_padding_mask)
        self.cal_entropy_func(attn_weights_float, bsz, src_len, tgt_len)
        self.cal_topk_func(attn_weights_float, bsz, src_len, tgt_len)
        self.cal_monotonic_func(attn_weights_float, bsz, src_len, tgt_len)

        return attn, attn_weights

-    def cal_localness_func(self, attn_weights_float, bsz, src_len, tgt_len):
+    def cal_localness_func(self, attn_weights_float, bsz, src_len, tgt_len, key_padding_mask):
+        """Update the running mean of attention mass that stays near the diagonal.
+
+        For each sample, every unpadded position j at least ``window`` away from
+        both ends contributes the head-averaged weight it puts on positions
+        j - window .. j + window, where window = int(src_len * localness_window).
+        The per-sample mean of those sums is averaged over the batch and folded
+        into self.localness.  Runs only in eval mode with cal_localness set.
+
+        key_padding_mask: optional mask indexed as [sample, position]; truthy
+            entries are skipped.  Indexing weights[i, j, j + k] assumes
+            tgt_len == src_len (self-attention) -- TODO confirm for other callers.
+        """
        if not self.training and self.cal_localness:
            weights = attn_weights_float.view(
                bsz, self.num_heads, tgt_len, src_len
-            ).transpose(1, 0).mean(dim=0)
+            ).transpose(1, 0).mean(0).cpu()

            localness = 0
-            item_localness = 0
            window = int(src_len * self.localness_window)
-            for i in range(window, src_len - window):
+            for i in range(bsz):
+                sum_num = 0
                item_localness = 0
-                for j in range(-window, window + 1):
-                    # if j == 0:
-                        # continue
-                    item_localness += weights[:, i, i + j]
-                localness += item_localness
-            localness = localness / (src_len - 2 * window)
-            localness *= 100
+                for j in range(window, src_len - window):
+                    # Skip positions flagged in the padding mask.
+                    if key_padding_mask is not None and key_padding_mask[i, j]:
+                        continue
+
+                    unit_localness = 0
+                    for k in range(-window, window + 1):
+                        unit_localness += weights[i, j, j + k]
+                    item_localness += unit_localness
+                    sum_num += 1
+                if sum_num > 0:
+                    localness += item_localness / sum_num
+            # Stay a tensor even if nothing was counted, so .mean() below is valid.
+            localness = weights.new_zeros(()) + localness / bsz

            if self.localness_num == 0:
                self.localness = localness.mean()
            else:
                self.localness = (self.localness * self.localness_num + localness.mean()) / (self.localness_num + 1)
            self.localness_num += 1
    
    def cal_entropy_func(self, attn_weights_float, bsz, src_len, tgt_len):

--- a/fairseq/scoring/tokenizer.py
+++ b/fairseq/scoring/tokenizer.py
@@ -5,8 +5,11 @@

 import unicodedata

+import sacrebleu as sb
+
 from fairseq.dataclass import ChoiceEnum

+SACREBLEU_V2_ABOVE = int(sb.__version__.split(".")[0]) >= 2  # compare the whole major version

 class EvaluationTokenizer(object):
    """A generic evaluation-time tokenizer, which leverages built-in tokenizers
@@ -24,7 +27,12 @@ class EvaluationTokenizer(object):

    SPACE = chr(32)
    SPACE_ESCAPE = chr(9601)
-    ALL_TOKENIZER_TYPES = ChoiceEnum(["none", "13a", "intl", "zh", "ja-mecab"])
+    _ALL_TOKENIZER_TYPES = (
+        sb.BLEU.TOKENIZERS
+        if SACREBLEU_V2_ABOVE
+        else ["none", "13a", "intl", "zh", "ja-mecab"]
+    )
+    ALL_TOKENIZER_TYPES = ChoiceEnum(_ALL_TOKENIZER_TYPES)

    def __init__(
        self,
@@ -33,13 +41,16 @@ class EvaluationTokenizer(object):
        punctuation_removal: bool = False,
        character_tokenization: bool = False,
    ):
-        from sacrebleu.tokenizers import TOKENIZERS
-
-        assert tokenizer_type in TOKENIZERS, f"{tokenizer_type}, {TOKENIZERS}"
+        assert (
+            tokenizer_type in self._ALL_TOKENIZER_TYPES
+        ), f"{tokenizer_type}, {self._ALL_TOKENIZER_TYPES}"
        self.lowercase = lowercase
        self.punctuation_removal = punctuation_removal
        self.character_tokenization = character_tokenization
-        self.tokenizer = TOKENIZERS[tokenizer_type]
+        if SACREBLEU_V2_ABOVE:
+            self.tokenizer = sb.BLEU(tokenize=str(tokenizer_type)).tokenizer
+        else:
+            self.tokenizer = sb.tokenizers.TOKENIZERS[tokenizer_type]()

    @classmethod
    def remove_punctuation(cls, sent: str):
@@ -51,7 +62,7 @@ class EvaluationTokenizer(object):
        )

    def tokenize(self, sent: str):
-        tokenized = self.tokenizer()(sent)
+        tokenized = self.tokenizer(sent)

        if self.punctuation_removal:
            tokenized = self.remove_punctuation(tokenized)