Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
S
S2T
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
xuchen
S2T
Commits
478c694b
Commit
478c694b
authored
Aug 03, 2023
by
xuchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
shell
parent
e248f2f0
显示空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
69 行增加
和
120 行删除
+69
-120
egs/mustc/asr/decode.sh
+4
-4
egs/mustc/asr/run.sh
+50
-95
egs/mustc/st/run.sh
+15
-21
没有找到文件。
egs/mustc/asr/decode.sh
查看文件 @
478c694b
#!/usr/bin/env bash
gpu_num
=
1
gpu_num
=
0
data_dir
=
test_subset
=(
dev tst-COMMON
)
...
...
@@ -15,12 +15,12 @@ ctc_infer=0
n_average
=
10
beam_size
=
5
len_penalty
=
1.0
max_tokens
=
8
0000
max_tokens
=
5
0000
dec_model
=
checkpoint_best.pt
cmd
=
"./run.sh
--stage
3
--stop_stage
3
--stage
2
--stop_stage
2
--gpu_num
${
gpu_num
}
--exp_name
${
exp_name
}
--n_average
${
n_average
}
...
...
egs/mustc/asr/run.sh
查看文件 @
478c694b
...
...
@@ -2,8 +2,7 @@
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
...
...
@@ -16,22 +15,21 @@ eval=1
time
=
$(
date
"+%m%d_%H%M"
)
stage
=
1
stop_stage
=
4
stop_stage
=
2
########
h
ardware ########
#
d
evices
########
H
ardware ########
#
D
evices
device
=(
0
)
gpu_num
=
8
update_freq
=
1
hdfs_get
=
0
root_dir
=
/opt/tiger
data_root_dir
=
/mnt/bn/nas-xc-1
code_dir
=
${
root_dir
}
/s2t
pwd_dir
=
$PWD
root_dir
=
${
ST_ROOT
}
data_root_dir
=
${
root_dir
}
code_dir
=
${
root_dir
}
/S2T
#
d
ataset
#
D
ataset
src_lang
=
en
tgt_lang
=
de
dataset
=
must_c
...
...
@@ -63,24 +61,22 @@ valid_split=dev
test_split
=
tst-COMMON
test_subset
=
dev,tst-COMMON
# exp
# Exp
sub_tag
=
exp_prefix
=
$(
date
"+%m%d"
)
# exp_subfix=${ARNOLD_JOB_ID}_${ARNOLD_TASK_ID}_${ARNOLD_TRIAL_ID}
extra_tag
=
extra_parameter
=
exp_tag
=
baseline
exp_name
=
#
config
#
Training Settings
train_config
=
base,ctc
data_config
=
config.yaml
# training setting
fp16
=
1
max_tokens
=
40000
step_valid
=
0
data_config
=
config.yaml
#
decoding setting
#
Decoding Settings
cer
=
0
ctc_infer
=
0
ctc_self_ensemble
=
0
...
...
@@ -92,6 +88,7 @@ len_penalty=1.0
infer_score
=
0
infer_parameters
=
# Parsing Options
if
[[
${
speed_perturb
}
-eq
1
]]
;
then
data_dir
=
${
data_dir
}
_sp
exp_prefix
=
${
exp_prefix
}
_sp
...
...
@@ -124,19 +121,6 @@ if [[ ! -d ${data_dir} ]]; then
exit
fi
# setup nccl envs
export
NCCL_IB_DISABLE
=
0
export
NCCL_IB_HCA
=
$ARNOLD_RDMA_DEVICE
:1
export
NCCL_IB_GID_INDEX
=
3
export
NCCL_SOCKET_IFNAME
=
eth0
HOSTS
=
$ARNOLD_WORKER_HOSTS
HOST
=(
${
HOSTS
//,/
}
)
HOST_SPLIT
=(
${
HOST
//
:/
}
)
PORT
=
${
HOST_SPLIT
[1]
}
INIT_METHOD
=
"tcp://
${
ARNOLD_WORKER_0_HOST
}
:
${
ARNOLD_WORKER_0_PORT
}
"
DIST_RANK
=
$((
ARNOLD_ID
*
ARNOLD_WORKER_GPU
))
export
PATH
=
$PATH
:
${
code_dir
}
/scripts
.
./local/parse_options.sh
||
exit
1
;
...
...
@@ -150,21 +134,27 @@ if [[ -z ${exp_name} ]]; then
exp_name
=
${
exp_name
}
_
${
exp_subfix
}
fi
fi
model_dir
=
${
code_dir
}
/checkpoints/
${
data_model_subfix
}
/
${
exp_name
}
echo
"stage:
$stage
"
echo
"stop_stage:
$stop_stage
"
ckpt_dir
=
${
root_dir
}
/checkpoints/
model_dir
=
${
root_dir
}
/checkpoints/
${
data_model_subfix
}
/
${
sub_tag
}
/
${
exp_name
}
# Start
cd
${
code_dir
}
echo
"Start Stage:
$stage
"
echo
"Stop Stage:
$stop_stage
"
if
[[
`
pip list |
grep
fairseq | wc
-l
`
-eq
0
]]
;
then
echo
"Default Stage: env configure"
pip3 install
-e
${
code_dir
}
fi
if
[
${
stage
}
-le
-1
]
&&
[
${
stop_stage
}
-ge
-1
]
;
then
echo
"stage -1: Data Download"
# pass
echo
"Stage -1: Data Download"
fi
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo
"stage 0: ASR Data Preparation"
echo
"Stage 0: ASR Data Preparation"
if
[[
!
-e
${
data_dir
}
]]
;
then
mkdir
-p
${
data_dir
}
fi
...
...
@@ -205,32 +195,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
[[
$eval
-eq
1
]]
&&
eval
${
cmd
}
fi
if
[[
`
pip list |
grep
fairseq | wc
-l
`
-eq
0
]]
;
then
echo
"default stage: env configure"
pip3 install
-e
${
code_dir
}
-i
https://bytedpypi.byted.org/simple
--no-build-isolation
--default-timeout
=
10000
fi
if
[[
-d
/mnt/bn/nas-xc-1/checkpoints
&&
!
-d
${
code_dir
}
/checkpoints
]]
;
then
ln
-s
/mnt/bn/nas-xc-1/checkpoints
${
code_dir
}
fi
# if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if
[
${
hdfs_get
}
-eq
1
]
;
then
ln_data_dir
=
`
echo
${
data_dir
}
| sed
-e
"s#
${
data_root_dir
}
#
${
code_dir
}
#"
`
echo
${
ln_data_dir
}
mkdir
-p
${
ln_data_dir
}
ln
-s
${
data_dir
}
/../
*
${
ln_data_dir
}
rm
-r
${
ln_data_dir
}
hdfs_path
=
`
echo
${
data_dir
}
| sed
-e
"s#
${
data_root_dir
}
#hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/#"
`
hdfs dfs
-get
${
hdfs_path
}
${
ln_data_dir
}
sed
-i
-e
"s#
${
data_root_dir
}
#
${
code_dir
}
#"
${
ln_data_dir
}
/config
*
data_dir
=
${
ln_data_dir
}
fi
# fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
echo
"stage 2: ASR Network Training"
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
echo
"Stage 1: Network Training"
[[
!
-d
${
data_dir
}
]]
&&
echo
"The data dir
${
data_dir
}
is not existing!"
&&
exit
1
;
if
[[
-z
${
device
}
||
${#
device
[@]
}
-eq
0
]]
;
then
...
...
@@ -240,6 +206,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
source
./local/utils.sh
device
=
$(
get_devices
$gpu_num
0
)
fi
export
CUDA_VISIBLE_DEVICES
=
${
device
}
fi
echo
-e
"data=
${
data_dir
}
model=
${
model_dir
}
"
...
...
@@ -327,22 +294,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo
-e
"
\0
33[34mRun command:
\n
${
cmd
}
\0
33[0m"
# save info
log
=
.
/history.log
log
=
${
ckpt_dir
}
/history.log
echo
"
${
time
}
|
${
data_dir
}
|
${
exp_name
}
|
${
model_dir
}
"
>>
$log
tail
-n
50
${
log
}
>
tmp.log
mv tmp.log
$log
# export CUDA_VISIBLE_DEVICES=${device}
log
=
${
model_dir
}
/train.log
cmd
=
"
${
cmd
}
2>&1 | tee -a
${
log
}
"
#cmd="nohup ${cmd} >> ${log} 2>&1 &"
if
[[
$eval
-eq
1
]]
;
then
# tensorboard
if
[[
-z
${
ARNOLD_TENSORBOARD_CURRENT_PORT
}
]]
;
then
port
=
6666
else
port
=
${
ARNOLD_TENSORBOARD_CURRENT_PORT
}
fi
tensorboard
--logdir
${
model_dir
}
--port
${
port
}
--bind_all
&
echo
"
${
cmd
}
"
>
${
model_dir
}
/cmd
...
...
@@ -352,8 +314,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
echo
"
stage 3: ASR
Decoding"
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
echo
"
Stage 2:
Decoding"
if
[[
${
n_average
}
-ne
1
]]
;
then
# Average models
dec_model
=
avg_
${
n_average
}
_checkpoint.pt
...
...
@@ -377,18 +339,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
source
./local/utils.sh
device
=
$(
get_devices
$gpu_num
0
)
fi
export
CUDA_VISIBLE_DEVICES
=
${
device
}
fi
# export CUDA_VISIBLE_DEVICES=${device}
suffix
=
beam
${
beam_size
}
_alpha
${
len_penalty
}
_tokens
${
max_tokens
}
if
[[
${
n_average
}
-ne
1
]]
;
then
suffix
=
${
suffix
}
_
${
n_average
}
fi
if
[[
-n
${
cer
}
&&
${
cer
}
-eq
1
]]
;
then
suffix
=
${
suffix
}
_cer
else
suffix
=
${
suffix
}
_wer
fi
if
[[
${
n_average
}
-ne
1
]]
;
then
suffix
=
${
suffix
}
_
${
n_average
}
fi
if
[[
${
infer_score
}
-eq
1
]]
;
then
suffix
=
${
suffix
}
_score
fi
...
...
@@ -435,9 +397,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
cd
${
code_dir
}
if
[[
$eval
-eq
1
]]
;
then
src_ctc_file
=
translation-
${
subset
}
.txt
.ctc
if
[[
-f
${
model_dir
}
/
${
src_
ctc_file
}
]]
;
then
rm
${
model_dir
}
/
${
src_
ctc_file
}
ctc_file
=
translation-
${
subset
}
.ctc
if
[[
${
ctc_infer
}
-eq
1
&&
-f
${
model_dir
}
/
${
ctc_file
}
]]
;
then
rm
${
model_dir
}
/
${
ctc_file
}
fi
eval
$cmd
...
...
@@ -448,33 +410,34 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
cd
${
pwd_dir
}
if
[[
-f
${
model_dir
}
/enc_dump
]]
;
then
mv
${
model_dir
}
/enc_dump
${
model_dir
}
/
${
subset
}
-
${
suffix
}
-enc-dump
mv
${
model_dir
}
/enc_dump
${
model_dir
}
/
dump-
${
subset
}
-enc-
${
suffix
}
fi
if
[[
-f
${
model_dir
}
/dec_dump
]]
;
then
mv
${
model_dir
}
/dec_dump
${
model_dir
}
/
${
subset
}
-
${
suffix
}
-dec-dump
mv
${
model_dir
}
/dec_dump
${
model_dir
}
/
dump-
${
subset
}
-dec-
${
suffix
}
fi
trans_file
=
translation-
${
subset
}
-
${
suffix
}
.txt
if
[[
${
ctc_infer
}
-eq
1
&&
-f
${
model_dir
}
/
${
src_
ctc_file
}
]]
;
then
if
[[
${
ctc_infer
}
-eq
1
&&
-f
${
model_dir
}
/
${
ctc_file
}
]]
;
then
ref_file
=
${
model_dir
}
/
${
subset
}
.
${
src_lang
}
if
[[
!
-f
${
ref_file
}
]]
;
then
python3 ./local/extract_txt_from_tsv.py
${
data_dir
}
/
${
subset
}
.tsv
${
ref_file
}
"src_text"
fi
if
[[
-f
${
ref_file
}
]]
;
then
src_
ctc
=
$(
mktemp
-t
temp.record.XXXXXX
)
ctc
=
$(
mktemp
-t
temp.record.XXXXXX
)
cd
./local
./cal_wer.sh
${
model_dir
}
${
subset
}
${
trans_file
}
${
src_ctc_file
}
${
ref_file
}
>
${
src_
ctc
}
./cal_wer.sh
${
model_dir
}
${
subset
}
${
trans_file
}
${
ctc_file
}
${
ref_file
}
>
${
ctc
}
cd
..
echo
"CTC WER"
>>
${
result_file
}
tail
-n
2
${
src_
ctc
}
>>
${
result_file
}
tail
-n
2
${
ctc
}
>>
${
result_file
}
src_bleu
=
$(
mktemp
-t
temp.record.XXXXXX
)
cd local
./cal_ctc_bleu.sh
${
model_dir
}
${
subset
}
${
trans_file
}
${
src_
ctc_file
}
${
ref_file
}
${
tokenizer
}
${
src_lang
}
>
${
src_bleu
}
./cal_ctc_bleu.sh
${
model_dir
}
${
subset
}
${
trans_file
}
${
ctc_file
}
${
ref_file
}
${
tokenizer
}
${
src_lang
}
>
${
src_bleu
}
cd
..
cat
${
src_bleu
}
>>
${
result_file
}
rm
${
src_
ctc
}
${
src_bleu
}
rm
${
ctc
}
${
src_bleu
}
else
echo
"No reference for source language."
fi
...
...
@@ -484,11 +447,3 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo
cat
${
result_file
}
fi
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# cd ${fairseq_dir}
# echo "Stage 4: Upload model and log"
# echo "Path: hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}/${exp_name}"
# hdfs dfs -mkdir -p hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
# hdfs dfs -put -f ${model_dir} hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
# fi
egs/mustc/st/run.sh
查看文件 @
478c694b
...
...
@@ -2,7 +2,7 @@
# Processing MuST-C Datasets
# Copyright 2021 Chen Xu (xuchenn
eu@163
.com)
# Copyright 2021 Chen Xu (xuchenn
lp@outlook
.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
...
...
@@ -17,19 +17,19 @@ time=$(date "+%m%d_%H%M")
stage
=
1
stop_stage
=
2
########
h
ardware ########
#
d
evices
########
H
ardware ########
#
D
evices
device
=(
0
)
gpu_num
=
8
update_freq
=
1
pwd_dir
=
$PWD
root_dir
=
${
pwd_dir
}
/../../../..
data_root_dir
=
${
root_dir
}
/data
root_dir
=
${
pwd_dir
}
/../../../..
/
data_root_dir
=
${
root_dir
}
code_dir
=
${
root_dir
}
/S2T
#
d
ataset
#
D
ataset
src_lang
=
en
tgt_lang
=
de
dataset
=
must_c
...
...
@@ -63,7 +63,7 @@ valid_split=dev
test_split
=
tst-COMMON
test_subset
=
dev,tst-COMMON
#
e
xp
#
E
xp
sub_tag
=
exp_prefix
=
$(
date
"+%m%d"
)
extra_tag
=
...
...
@@ -71,16 +71,14 @@ extra_parameter=
exp_tag
=
baseline
exp_name
=
#
config
#
Training Settings
train_config
=
base,ctc
# training setting
fp16
=
1
max_tokens
=
40000
step_valid
=
0
bleu_valid
=
0
#
decoding setting
#
Decoding Settings
sacrebleu
=
1
dec_model
=
checkpoint_best.pt
ctc_infer
=
0
...
...
@@ -90,6 +88,7 @@ len_penalty=1.0
infer_score
=
0
infer_parameters
=
# Parsing Options
if
[[
${
share_dict
}
-eq
1
]]
;
then
data_config
=
config_share.yaml
else
...
...
@@ -136,12 +135,14 @@ if [[ -z ${exp_name} ]]; then
exp_name
=
${
exp_name
}
_
${
exp_subfix
}
fi
fi
ckpt_dir
=
${
code_dir
}
/checkpoints/
model_dir
=
${
code_dir
}
/checkpoints/
${
data_model_subfix
}
/
${
sub_tag
}
/
${
exp_name
}
# Start
cd
${
code_dir
}
echo
"Start Stage:
$stage
"
echo
"Stop Stage:
$stop_stage
"
cd
${
code_dir
}
if
[[
`
pip list |
grep
fairseq | wc
-l
`
-eq
0
]]
;
then
echo
"Default Stage: env configure"
...
...
@@ -150,12 +151,10 @@ fi
if
[
${
stage
}
-le
-1
]
&&
[
${
stop_stage
}
-ge
-1
]
;
then
echo
"Stage -1: Data Download"
# pass
fi
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo
"Stage 0: ASR Data Preparation"
if
[[
!
-e
${
data_dir
}
]]
;
then
mkdir
-p
${
data_dir
}
...
...
@@ -255,6 +254,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
source
./local/utils.sh
device
=
$(
get_devices
$gpu_num
0
)
fi
export
CUDA_VISIBLE_DEVICES
=
${
device
}
fi
echo
-e
"data=
${
data_dir
}
model=
${
model_dir
}
"
...
...
@@ -308,11 +308,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd
=
"
${
cmd
}
--distributed-world-size
$gpu_num
--ddp-backend no_c10d"
if
[[
${
DIST_RANK
}
-ne
0
]]
;
then
cmd
=
"
${
cmd
}
--distributed-init-method
${
INIT_METHOD
}
--distributed-rank
${
DIST_RANK
}
"
fi
fi
if
[[
$fp16
-eq
1
]]
;
then
cmd
=
"
${
cmd
}
...
...
@@ -362,7 +357,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo
"
${
time
}
|
${
data_dir
}
|
${
exp_name
}
|
${
model_dir
}
"
>>
$log
tail
-n
50
${
log
}
>
tmp.log
mv tmp.log
$log
# export CUDA_VISIBLE_DEVICES=${device}
log
=
${
model_dir
}
/train.log
cmd
=
"
${
cmd
}
2>&1 | tee -a
${
log
}
"
...
...
@@ -404,8 +398,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
source
./local/utils.sh
device
=
$(
get_devices
$gpu_num
0
)
fi
export
CUDA_VISIBLE_DEVICES
=
${
device
}
fi
# export CUDA_VISIBLE_DEVICES=${device}
suffix
=
beam
${
beam_size
}
_alpha
${
len_penalty
}
_tokens
${
max_tokens
}
if
[[
${
n_average
}
-ne
1
]]
;
then
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论