Commit de171aee by xuchen

add the pipeline of the MT task

parent 4fbd2ef6
# Training configuration: one "option: value" pair per line.
# NOTE(review): presumably this is the conf/train.yaml handed to the trainer
# through --train-config by run.sh — confirm.

# --- data subsets and run length ---
train-subset: train
valid-subset: valid
max-epoch: 20
max-update: 100000
skip-invalid-size-inputs-valid-test: True

# --- runtime, logging and reproducibility ---
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True

# --- optional initialisation from existing parameters (disabled) ---
#load-params:
#load-pretrained-encoder-from:

# --- model family ---
arch: transformer
share-decoder-input-output-embed: True

# --- optimisation and learning-rate schedule ---
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 5e-4
# NOTE(review): this key uses an underscore while almost every other key in
# this file uses hyphens — confirm the config loader accepts both spellings.
adam_betas: (0.9,0.98)

# --- training criterion ---
criterion: label_smoothed_cross_entropy
# NOTE(review): underscore spelling again (see adam_betas) — confirm.
label_smoothing: 0.1

# --- regularisation and network dimensions ---
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8

# --- decoder / dropout overrides, currently disabled ---
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
#! /bin/bash

# Decode with a trained MT model by running stage 2 of ./run.sh.
#
# Usage: <this script> [exp_name]
#   exp_name  optional experiment name forwarded to run.sh; when omitted,
#             run.sh derives the name itself.

gpu_num=1

test_subset=(test)

exp_name=
if [ "$#" -eq 1 ]; then
    exp_name=$1
fi

n_average=5
beam_size=5
max_tokens=4096

# NOTE: ${test_subset} expands to the first array element only, so a single
# subset name is forwarded.
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--test_subset ${test_subset}
--n_average ${n_average}
--beam_size ${beam_size}
--max_tokens ${max_tokens}
"

# Forward --exp_name only when a name was given. An empty value would leave a
# bare "--exp_name" whose argument becomes the next option name, which makes
# the "--name value" option parser stop early and ignore the remaining options.
if [[ -n ${exp_name} ]]; then
    cmd="$cmd --exp_name ${exp_name}"
fi

echo $cmd
eval $cmd
# Wait until ${gpu_num} GPUs without running processes are available, then run
# ${cmd} once and exit.
# NOTE(review): ${cmd} is not assigned in this part of the file; it is expected
# to be set before this loop — confirm.
gpu_num=1

while :
do
    # Take ONE gpustat snapshot per poll so that the device count and every
    # per-device check look at the same state.
    snapshot=$(gpustat -p)

    # Line 1 is the header and each following line is one GPU, so the number
    # of lines left after dropping two is the highest device index.
    last_index=$(sed '1,2d' <<< "$snapshot" | wc -l)

    # Start every poll from an empty selection.
    device=()
    for ((dev = 0; dev <= last_index; dev++))
    do
        # Field 4 of the '|'-separated line holds the process list; a GPU
        # counts as free when that field contains no words.
        use=$(head -n "$((dev + 2))" <<< "$snapshot" | tail -n 1 | cut -d '|' -f4 | wc -w)
        if [[ $use -eq 0 ]]; then
            device+=("$dev")
            if [[ ${#device[@]} -eq $gpu_num ]]; then
                break
            fi
        fi
    done

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        # Not enough free GPUs yet: poll again in a minute.
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
# Pass 1: scan the positional parameters for "--config <file>" pairs and source
# each file in left-to-right order. Nothing is shifted here, so the same
# arguments are seen again by the option loop below. The loop bound is
# "argpos < $#", so a trailing "--config" without a value is not handled here.
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
# Pass 2: consume leading "--name value" pairs. Each pair assigns "value" to the
# shell variable "name" (hyphens become underscores). The loop stops at the
# first argument that does not start with "--".
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
# NOTE(review): $help_message is used as the printf format string, so any '%'
# it contains is interpreted as a format directive.
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
# The "--name=value" spelling is rejected explicitly.
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined -- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
# Remember the current value so we can tell whether the option is Boolean.
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
# NOTE(review): $2 is not checked for presence; an option given without a
# value takes the following argument (or the empty string) as its value.
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
# Anything else ends option processing; it stays in the positional parameters.
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
# Environment setup, meant to be sourced from a recipe directory: it puts the
# recipe utilities and toolkit binaries on PATH, extends LD_LIBRARY_PATH and
# activates the Python environment.
MAIN_ROOT=$PWD/../../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi

export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH
[ ! -f "$KALDI_ROOT"/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. "$KALDI_ROOT"/tools/config/common_path.sh
export LC_ALL=C

# Append to LD_LIBRARY_PATH without creating an empty leading entry when the
# variable is unset or empty: the dynamic loader treats an empty entry as the
# current working directory.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}$MAIN_ROOT/src/lib
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
. "${MAIN_ROOT}"/tools/activate_python.sh && . "${MAIN_ROOT}"/tools/extra_path.sh
export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH

export OMP_NUM_THREADS=1

# check extra module installation
# "command -v" is the shell built-in lookup; unlike "which" it needs no
# external program.
if ! command -v tokenizer.perl > /dev/null; then
    echo "Error: it seems that moses is not installed." >&2
    echo "Error: please install moses as follows." >&2
    echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
    # "return" (not "exit") because this file is sourced by the calling script.
    return 1
fi

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
#######################################
# Pick GPUs that are (almost) idle according to gpustat.
#
# A GPU is considered free when the memory-used figure reported by gpustat
# (third '|'-separated field, the part before the '/') is below 100.
# While fewer than the requested number of GPUs are free, the function either
# falls back to the CPU or polls again every 60 seconds.
#
# Arguments:
#   $1 - number of GPUs required
#   $2 - 1 to fall back to the CPU (device id -1) instead of waiting
# Outputs:
#   comma-separated device ids on stdout, e.g. "0,1"; "-1" for the CPU fallback
# Returns:
#   exit status of the final print
#######################################
get_devices(){
    # Keep all working variables local so that a direct (non-subshell) call
    # cannot clobber same-named variables in the calling script.
    local gpu_num=$1
    local use_cpu=$2
    local -a device=()
    local snapshot last_index dev use

    while :
    do
        # Every poll starts from an empty selection.
        device=()

        # One gpustat snapshot per poll, held in memory (no temp file to
        # clean up afterwards).
        snapshot=$(gpustat)

        # Line 1 is the header and each following line is one GPU, so the
        # number of lines left after dropping two is the highest device index.
        last_index=$(sed '1,2d' <<< "$snapshot" | wc -l)

        for ((dev = 0; dev <= last_index; dev++))
        do
            # Device N is described on line N+2 of the snapshot.
            use=$(head -n "$((dev + 2))" <<< "$snapshot" | tail -n 1 | cut -d '|' -f3 | cut -d '/' -f1)
            if [[ $use -lt 100 ]]; then
                device+=("$dev")
                if [[ ${#device[@]} -eq $gpu_num ]]; then
                    break
                fi
            fi
        done

        if [[ ${#device[@]} -ge $gpu_num ]]; then
            break
        fi

        if [[ $use_cpu -eq 1 ]]; then
            # CPU fallback: report -1 and stop polling immediately.
            device=(-1)
            break
        fi

        sleep 60s
    done

    echo "${device[*]}" | sed 's/ /,/g'
    return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
#
# Staged MT pipeline: stage 0 prepares data, stage 1 trains, stage 2 decodes.
# Stages from ${stage} to ${stop_stage} (inclusive) are executed.
# Every default below that is assigned before parse_options.sh is sourced can
# be overridden on the command line as "--name value".

# Strict-mode flags actually enabled here:
# -e 'exit on error' and -o pipefail 'a failing pipeline stage fails the pipeline'.
# -u 'undefined variable' is left disabled.
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
# eval=1 runs the constructed commands; any other value only prints them.
eval=1
# Timestamp recorded in history.log by the training stage.
time=$(date "+%m%d_%H%M")
# First and last stage to run.
stage=0
stop_stage=0
######## hardware ########
# devices
# Empty device list: the stages pick GPUs themselves via get_devices.
device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=swa
tgt_lang=en
lang=${src_lang}-${tgt_lang}
dataset=lower
task=translation
vocab_type=unigram
vocab_size=32000
# 1: source and target share one vocabulary (adds "_share" to the data dir).
share_dict=1
org_data_dir=/media/data/${dataset}/mt
data_dir=~/st/data/${dataset}/mt/${lang}
# Names of the raw splits used for training, validation and testing.
train_prefix=train
valid_prefix=dev
test_prefix=test
# Subsets decoded in stage 2.
test_subset=(test)
# exp
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=train.yaml
# training setting
fp16=1
max_tokens=4096
# step_valid=1 switches to update-based validation/checkpointing (see below).
step_valid=0
# bleu_valid=1 adds BLEU-based validation options to the training command.
bleu_valid=0
# decoding setting
n_average=10
beam_size=5
# Apply command-line overrides to the variables defined above.
. ./local/parse_options.sh || exit 1;
# Checkpoint/validation cadence: fixed intervals with update-based checkpoints
# when step_valid is 1, otherwise validate every epoch and keep 10 epochs.
if [[ $step_valid -eq 1 ]]; then
validate_interval=10000
save_interval=10000
no_epoch_checkpoints=1
save_interval_updates=5000
keep_interval_updates=3
else
validate_interval=1
keep_last_epochs=10
fi
# full path
train_config=$pwd_dir/conf/${train_config}
# Default experiment name: <config file name without extension>_<exp_tag>[_<extra_tag>].
if [[ -z ${exp_name} ]]; then
exp_name=$(basename ${train_config%.*})_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
# Processed data lives in a sub-directory named after the vocabulary setup.
data_dir=${data_dir}/${vocab_type}${vocab_size}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
fi
# Stage 0: build the subword vocabulary, segment every split with it and
# binarise the result into ${data_dir}/data-bin.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make data the following preparation part by yourself.
    echo "stage 0: MT Data Preparation"
    if [[ ! -e ${data_dir} ]]; then
        mkdir -p ${data_dir}
    fi

    # File-name prefixes of the sentencepiece model/vocabulary for each side.
    src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
    tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}

    cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_prefix},${valid_prefix},${test_prefix}
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
    if [[ $share_dict -eq 1 ]]; then
        cmd="$cmd
--share"
        # Shared vocabulary: both sides use the same model. The prefix follows
        # ${vocab_type} like the per-language prefixes above instead of a
        # hard-coded "unigram".
        # NOTE(review): assumes prep_mt_data.py names the shared model
        # spm_<vocab_type><vocab_size>_share — confirm.
        src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
        tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
    fi

    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval ${cmd}

    # Segment the raw text of every split into subword pieces.
    mkdir -p ${data_dir}/data
    for split in ${train_prefix} ${valid_prefix} ${test_prefix}; do
        cmd="spm_encode
--model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}/${split}.${src_lang}
> ${data_dir}/data/${split}.${src_lang}"

        echo -e "\033[34mRun command: \n${cmd} \033[0m"
        [[ $eval -eq 1 ]] && eval ${cmd}

        cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"

        echo -e "\033[34mRun command: \n${cmd} \033[0m"
        [[ $eval -eq 1 ]] && eval ${cmd}
    done

    # Binarise the segmented text with the dictionaries produced above.
    cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_prefix}
--validpref ${data_dir}/data/${valid_prefix}
--testpref ${data_dir}/data/${test_prefix}
--destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
--workers 64"

    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval ${cmd}
fi
# From here on data_dir points at the binarised data written by stage 0.
data_dir=${data_dir}/data-bin
# Stage 1: assemble the training command line, start it in the background and
# follow its log.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: MT Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
# No device given: leave the list empty for gpu_num=0, otherwise let
# get_devices (from local/utils.sh) choose gpu_num devices.
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
# Keep a copy of this script, train.sh and the training config next to the model.
# NOTE(review): with "set -e" active, a missing ${PWD}/train.sh aborts the run here.
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
cp ${train_config} ${model_dir}
# Base training command; optional flags are appended below.
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--train-config ${train_config}
--task ${task}
--max-tokens ${max_tokens}
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
# Free-form extra options supplied by the caller.
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
# Distributed options are added whenever at least one GPU is requested.
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
# Optional BLEU-based validation and checkpoint selection.
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
# The following options are added only when the matching variable is set.
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
# Append this run to ./history.log and keep only its last 50 lines.
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${model_dir} " >> $log
cat $log | tail -n 50 > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
# Run detached in the background; stdout and stderr are appended to train.log.
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
# Print the log written so far and keep following it; "tail -f" does not
# return on its own.
tail -n `wc -l ${model_dir}/train.log | awk '{print $1+1}'` -f ${model_dir}/train.log
fi
fi
# Block until background jobs started by this shell have finished.
wait
# Stage 2: optionally average the last ${n_average} epoch checkpoints, decode
# every subset in ${test_subset} and collect the last line of each generation
# log in ${model_dir}/decode_result.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: MT Decoding"
    if [[ ${n_average} -ne 1 ]]; then
        # Average models
        dec_model=avg_${n_average}_checkpoint.pt

        cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"
        [[ $eval -eq 1 ]] && eval $cmd
    else
        dec_model=checkpoint_best.pt
    fi

    # No device given: leave the list empty for gpu_num=0, otherwise let
    # get_devices (from local/utils.sh) choose gpu_num devices.
    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
        if [[ ${gpu_num} -eq 0 ]]; then
            device=()
        else
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
        fi
    fi
    export CUDA_VISIBLE_DEVICES=${device}

    #tmp_file=$(mktemp ${model_dir}/tmp-XXXXX)
    #trap 'rm -rf ${tmp_file}' EXIT
    # Start from a fresh result file on every run.
    result_file=${model_dir}/decode_result
    [[ -f ${result_file} ]] && rm ${result_file}

    for subset in ${test_subset[@]}; do
        # No --config-yaml here: this script defines no data_config, and an
        # empty value would make the flag swallow "--gen-subset" as its
        # argument. The training command above does not pass it either.
        cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--gen-subset ${subset}
--task ${task}
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--scoring sacrebleu"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"

        if [[ $eval -eq 1 ]]; then
            eval $cmd
            # The last line of the generation log carries the summary score.
            tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
        fi
    done

    # In print-only mode (eval != 1) no result file is written; skip the
    # summary instead of failing under "set -e".
    if [[ -f ${result_file} ]]; then
        cat ${result_file}
    fi
fi
#! /bin/bash

# Launch MT training, i.e. stage 1 of ./run.sh, with the settings below.

# hardware and batching
gpu_num=8
update_freq=1
max_tokens=4096

# experiment naming and extra trainer options
exp_tag=baseline_997
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "

train_config=train.yaml

# Mandatory part of the command line, assembled option by option.
cmd="./run.sh"
cmd="${cmd} --stage 1"
cmd="${cmd} --stop_stage 1"
cmd="${cmd} --gpu_num ${gpu_num}"
cmd="${cmd} --update_freq ${update_freq}"
cmd="${cmd} --train_config ${train_config}"
cmd="${cmd} --max_tokens ${max_tokens}"

# Optional settings are forwarded only when they are non-empty.
[[ -z ${exp_tag} ]] || cmd="${cmd} --exp_tag ${exp_tag}"
[[ -z ${extra_tag} ]] || cmd="${cmd} --extra_tag ${extra_tag}"
# The value is wrapped in escaped quotes so it stays one argument through eval.
[[ -z ${extra_parameter} ]] || cmd="${cmd} --extra_parameter \"${extra_parameter}\""

echo $cmd
eval $cmd
...@@ -13,7 +13,7 @@ get_devices(){ ...@@ -13,7 +13,7 @@ get_devices(){
do do
line=`expr $dev + 2` line=`expr $dev + 2`
use=`cat $record | head -n $line | tail -1 | cut -d '|' -f3 | cut -d '/' -f1` use=`cat $record | head -n $line | tail -1 | cut -d '|' -f3 | cut -d '/' -f1`
if [[ $use -lt 10 ]]; then if [[ $use -lt 100 ]]; then
device[$count]=$dev device[$count]=$dev
count=`expr $count + 1` count=`expr $count + 1`
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
......
...@@ -33,13 +33,16 @@ tgt_lang=de ...@@ -33,13 +33,16 @@ tgt_lang=de
lang=${src_lang}-${tgt_lang} lang=${src_lang}-${tgt_lang}
dataset=mustc dataset=mustc
task=translation_with_tokenizer task=translation
vocab_type=unigram vocab_type=unigram
vocab_size=10000 vocab_size=10000
share_dict=1 share_dict=1
org_data_dir=/media/data/${dataset} org_data_dir=/media/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang} data_dir=~/st/data/${dataset}/mt/${lang}
train_prefix=train
valid_prefix=dev
test_prefix=tst-COMMON
test_subset=(tst-COMMON) test_subset=(tst-COMMON)
# exp # exp
...@@ -49,7 +52,7 @@ exp_tag=baseline ...@@ -49,7 +52,7 @@ exp_tag=baseline
exp_name= exp_name=
# config # config
train_config=st_train_ctc.yaml train_config=train.yaml
# training setting # training setting
fp16=1 fp16=1
...@@ -74,12 +77,6 @@ else ...@@ -74,12 +77,6 @@ else
keep_last_epochs=10 keep_last_epochs=10
fi fi
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
data_config=config.yaml
fi
# full path # full path
train_config=$pwd_dir/conf/${train_config} train_config=$pwd_dir/conf/${train_config}
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
...@@ -96,6 +93,11 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then ...@@ -96,6 +93,11 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# pass # pass
fi fi
data_dir=${data_dir}/${vocab_type}${vocab_size}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself. ### Task dependent. You have to make data the following preparation part by yourself.
echo "stage 0: MT Data Preparation" echo "stage 0: MT Data Preparation"
...@@ -103,10 +105,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -103,10 +105,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir} mkdir -p ${data_dir}
fi fi
src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--splits train,dev,tst-COMMON,tst-HE --splits ${train_prefix},${valid_prefix},${test_prefix}
--src-lang ${src_lang} --src-lang ${src_lang}
--tgt-lang ${tgt_lang} --tgt-lang ${tgt_lang}
--vocab-type ${vocab_type} --vocab-type ${vocab_type}
...@@ -114,12 +118,50 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -114,12 +118,50 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ $share_dict -eq 1 ]]; then if [[ $share_dict -eq 1 ]]; then
cmd="$cmd cmd="$cmd
--share" --share"
src_vocab_prefix=spm_unigram${vocab_size}_share
tgt_vocab_prefix=spm_unigram${vocab_size}_share
fi fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
mkdir -p ${data_dir}/data
for split in ${train_prefix} ${valid_prefix} ${test_prefix}; do
cmd="spm_encode
--model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}/${split}.${src_lang}
> ${data_dir}/data/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/train/${train_prefix}
--validpref ${data_dir}/data/dev/${valid_prefix}
--testpref ${data_dir}/data/test/${test_prefix}
--destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi fi
data_dir=${data_dir}/data-bin
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: MT Network Training" echo "stage 1: MT Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1; [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
...@@ -149,7 +191,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -149,7 +191,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
--target-lang ${tgt_lang} --target-lang ${tgt_lang}
--config-yaml ${data_config}
--train-config ${train_config} --train-config ${train_config}
--task ${task} --task ${task}
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
......
...@@ -25,6 +25,7 @@ log = logging.getLogger(__name__) ...@@ -25,6 +25,7 @@ log = logging.getLogger(__name__)
MANIFEST_COLUMNS = ["src_text", "tgt_text"] MANIFEST_COLUMNS = ["src_text", "tgt_text"]
class MTData(Dataset): class MTData(Dataset):
""" """
Create a Dataset for MuST-C. Each item is a tuple of the form: Create a Dataset for MuST-C. Each item is a tuple of the form:
...@@ -34,8 +35,8 @@ class MTData(Dataset): ...@@ -34,8 +35,8 @@ class MTData(Dataset):
def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None: def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None:
_root = Path(root) / f"{src_lang}-{tgt_lang}" / "data" / split _root = Path(root) / f"{src_lang}-{tgt_lang}" / "data" / split
txt_root = _root / "txt" txt_root = _root
assert _root.is_dir() and txt_root.is_dir() assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root)
# Load source and target text # Load source and target text
self.data = [] self.data = []
for _lang in [src_lang, tgt_lang]: for _lang in [src_lang, tgt_lang]:
...@@ -83,8 +84,6 @@ def process(args): ...@@ -83,8 +84,6 @@ def process(args):
if is_train_split: if is_train_split:
src_train_text.extend(manifest["src_text"]) src_train_text.extend(manifest["src_text"])
tgt_train_text.extend(manifest["tgt_text"]) tgt_train_text.extend(manifest["tgt_text"])
df = pd.DataFrame.from_dict(manifest)
save_df_to_tsv(df, output_root / f"{split}.tsv")
# Generate vocab and yaml # Generate vocab and yaml
v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论