#!/usr/bin/env bash
#
# Prepare WMT test sets for fairseq.
#
# For every split listed in `splits` the script:
#   1. optionally tokenizes the source/target files with tokenizer.perl,
#   2. encodes them into subword pieces with spm_encode,
#   3. copies the results to ${data_dir}/final/<split>.<lang>,
# and finally binarizes all splits with fairseq_cli/preprocess.py.
#
# Requirements: tokenizer.perl, spm_encode and python must be on PATH.
# NOTE: the commands are assembled as strings (they contain redirections) and
# executed through `eval`, so the configured paths must not contain whitespace
# or shell metacharacters.

set -euo pipefail

# 1 = print and execute every command; 0 = dry run (print only).
eval=1

root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
src_vocab_prefix=spm_unigram32000_share
tgt_vocab_prefix=spm_unigram32000_share
src_lang=en
tgt_lang=de

# 1 = run tokenizer.perl before subword encoding; 0 = encode the raw files.
tokenize=1

splits=(newstest2014 newstest2016)

#######################################
# Print a command in blue and execute it unless in dry-run mode.
# Globals:   eval (read)
# Arguments: $1 - full command line as a single string
# Outputs:   the command line on stdout, plus whatever the command prints
# Returns:   0 in dry-run mode, otherwise the command's status (a failure
#            aborts the script because of `set -e`)
#######################################
run_cmd() {
  local cmd=$1
  printf '\033[34mRun command: \n%s \033[0m\n' "${cmd}"
  # An explicit `if` (rather than `[[ ... ]] && eval`) keeps the status 0 in
  # dry-run mode, so the script does not exit non-zero when nothing failed.
  if [[ ${eval} -eq 1 ]]; then
    eval "${cmd}"
  fi
}

# Only touch the filesystem when commands are really executed.
if [[ ${eval} -eq 1 ]]; then
  mkdir -p -- "${data_dir}/final"
fi

for split in "${splits[@]}"; do
  src_file=${data_dir}/${split}.${src_lang}
  tgt_file=${data_dir}/${split}.${tgt_lang}

  if [[ ${tokenize} -eq 1 ]]; then
    run_cmd "tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
    run_cmd "tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
    # Later stages consume the tokenized files.
    src_file=${src_file}.tok
    tgt_file=${tgt_file}.tok
  fi

  run_cmd "spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model --output_format=piece < ${src_file} > ${src_file}.spm"
  run_cmd "spm_encode --model ${vocab_dir}/${tgt_vocab_prefix}.model --output_format=piece < ${tgt_file} > ${tgt_file}.spm"
  src_file=${src_file}.spm
  tgt_file=${tgt_file}.spm

  run_cmd "cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
  run_cmd "cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
done

# Build the comma-separated list of split prefixes passed to --testpref.
dataset=()
for split in "${splits[@]}"; do
  dataset+=("${data_dir}/final/${split}")
done
# "${dataset[*]}" joins on the first character of IFS; the subshell keeps the
# IFS change local.
pref=$(IFS=,; printf '%s' "${dataset[*]}")

run_cmd "python ${root_dir}/fairseq_cli/preprocess.py --source-lang ${src_lang} --target-lang ${tgt_lang} --testpref ${pref} --destdir ${data_dir}/data-bin --srcdict ${vocab_dir}/${src_vocab_prefix}.txt --tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt --workers 64"