xuchen / Fairseq-S2T / Commits

Commit e1d3d2ed, authored Jul 25, 2022 by xuchen

    update shell scripts

Parent: de9ef921
Showing 6 changed files with 35 additions and 37 deletions (+35, -37).
egs/tibetan/asr/conf/basis.yaml    +1   -0
egs/tibetan/asr/conf/big.yaml      +3   -0
egs/tibetan/asr/conf/ctc.yaml      +2   -1
egs/tibetan/asr/conf/inter.yaml    +5   -20
egs/tibetan/asr/local/utils.sh     +1   -1
egs/tibetan/asr/run.sh             +23  -15
egs/tibetan/asr/conf/basis.yaml
@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
egs/tibetan/asr/conf/big.yaml
@@ -11,6 +11,9 @@ adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
egs/tibetan/asr/conf/ctc.yaml
ctc-weight: 0.3
post-process: sentencepiece
share-ctc-and-embed: True
\ No newline at end of file
egs/tibetan/asr/conf/inter.yaml
@@ -4,27 +4,12 @@ share-target-ctc-and-embed: True
interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 6,9
sae-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0
#target-ctc-weight: 0.3
#target-ctc-layer: 6
#target-interleaved-ctc-weight: 0.2
#target-interleaved-ctc-layers: 2,4
#sae-ground-truth-ratio: 0.1
sae-adapter: inter_league
sae-ctc-temperature: 1
#sae-gumbel: True
#sae-distribution-hard: True
#sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
#share-sae-and-ctc: True
#share-target-sae-and-ctc: True
#sae-embed-norm: True
#sae-out-norm: True
sae-drop-prob: 0.0
sae-distribution-cutoff: 0
share-ctc-and-sae: False
#ctc-self-distill-weight: 1
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
ctc-self-distill-weight: 0
egs/tibetan/asr/local/utils.sh
@@ -14,7 +14,7 @@ get_devices(){
        do
            line=$((dev + 2))
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
            if [[ $use -lt 100 ]]; then
            if [[ $use -lt 1000 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
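The get_devices() hunk above only touches the memory-usage bound the loop compares against (100 vs. 1000). For context, here is a minimal standalone sketch, not taken from the repository, of the same idea: pick GPUs whose reported memory usage is below a threshold. It assumes nvidia-smi is on PATH; the function name pick_free_gpus, the 1000 MiB threshold, and the exact query flags are illustrative choices.

    # Sketch only: select up to $1 GPUs with used memory below a threshold.
    pick_free_gpus() {
        local gpu_num=$1
        local threshold=1000      # MiB; assumed, mirrors the bound in utils.sh
        local count=0
        local dev=0
        local devices=()
        while read -r used; do
            if [[ ${used} -lt ${threshold} ]]; then
                devices[count]=${dev}
                count=$((count + 1))
                [[ ${count} -eq ${gpu_num} ]] && break
            fi
            dev=$((dev + 1))
        done < <(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits)
        # Print a comma-separated list suitable for CUDA_VISIBLE_DEVICES.
        (IFS=,; echo "${devices[*]}")
    }

    # Hypothetical usage: export CUDA_VISIBLE_DEVICES=$(pick_free_gpus 2)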
egs/tibetan/asr/run.sh
#! /bin/bash

# Processing MuST-C Datasets
# Processing Tibetan ASR Dataset
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
@@ -30,8 +30,6 @@ pwd_dir=$PWD
# dataset
src_lang=ti
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=tibetan
subset=seda
@@ -39,10 +37,10 @@ task=speech_to_text
vocab_type=unigram
vocab_type=char
#vocab_type=word
vocab_size=1700
vocab_size=5000
speed_perturb=0
lcrm=0
tokenizer=1
tokenizer=0
use_raw_audio=0
use_specific_dict=0
@@ -54,9 +52,9 @@ if [[ -n ${subset} ]]; then
    dataset=${dataset}/${subset}
fi
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr_char
data_dir=${root_dir}/data/${dataset}/asr_word
# data_dir=${root_dir}/data/${dataset}/asr
# data_dir=${root_dir}/data/${dataset}/asr_char
# data_dir=${root_dir}/data/${dataset}/asr_word
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=test
@@ -105,6 +103,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
    data_dir=${data_dir}_raw
    exp_prefix=${exp_prefix}_raw
fi
if [[ "${vocab_type}" == "char" ]]; then
    data_dir=${data_dir}_char
    exp_prefix=${exp_prefix}_char
fi

. ./local/parse_options.sh || exit 1;
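For context, a small illustration, with assumed values that are not from the repository, of how the block above rewrites the paths when vocab_type is set to char:

    # Illustration only; root_dir and exp_prefix values are hypothetical.
    root_dir=/path/to/Fairseq-S2T
    dataset=tibetan/seda
    data_dir=${root_dir}/data/${dataset}/asr
    exp_prefix=baseline
    vocab_type=char
    use_raw_audio=0

    if [[ ${use_raw_audio} -eq 1 ]]; then
        data_dir=${data_dir}_raw
        exp_prefix=${exp_prefix}_raw
    fi
    if [[ "${vocab_type}" == "char" ]]; then
        data_dir=${data_dir}_char
        exp_prefix=${exp_prefix}_char
    fi
    echo ${data_dir}    # /path/to/Fairseq-S2T/data/tibetan/seda/asr_char
    echo ${exp_prefix}  # baseline_char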
@@ -268,10 +270,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        cmd="${cmd} --save-interval $save_interval"
    fi
    if [[ -n $keep_last_epochs ]]; then
        cmd="${cmd} --keep-last-epochs $keep_last_epochs"
    fi
    if [[ -n $save_interval_updates ]]; then
        cmd="${cmd} --save-interval-updates $save_interval_updates"
@@ -290,11 +288,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
            mv tmp.log $log
        export CUDA_VISIBLE_DEVICES=${device}
        cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
        log=${model_dir}/train.log
        cmd="nohup ${cmd} >> ${log} 2>&1 &"
        if [[ $eval -eq 1 ]]; then
            eval $cmd
            sleep 2s
            tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
            tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
        fi
    fi
    wait
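The hunk above mainly factors the training log path into a log variable before the command is wrapped with nohup. A runnable sketch of the same launch-and-follow pattern, with a placeholder command standing in for the assembled fairseq-train invocation:

    # Sketch only; "sleep 30" stands in for the real training command string.
    model_dir=./launch_demo              # hypothetical directory
    mkdir -p ${model_dir}
    log=${model_dir}/train.log
    touch ${log}

    cmd="sleep 30"
    cmd="nohup ${cmd} >> ${log} 2>&1 &"
    eval $cmd
    sleep 2s
    # wc -l prints "N filename"; tail -n $((N+1)) therefore prints the existing
    # log in full, and -f keeps following as the background job appends to it.
    tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}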
@@ -327,7 +326,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    fi
    export CUDA_VISIBLE_DEVICES=${device}
    result_file=${model_dir}/decode_result
    suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
    if [[ -z ${cer} && ${cer} -eq 1 ]]; then
        suffix=${suffix}_cer
    else
        suffix=${suffix}_wer
    fi
    if [[ ${n_average} -ne 1 ]]; then
        suffix=${suffix}_${n_average}
    fi
    result_file=${model_dir}/decode_result_${suffix}
    [[ -f ${result_file} ]] && rm ${result_file}

    test_subset=${test_subset//,/ }
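With this change the decode settings are encoded in the result file name. A short illustration under assumed parameter values (all values below are hypothetical); note that the sketch tests -n where the committed script tests -z:

    # Illustration only; parameter values are assumptions.
    model_dir=checkpoints/tibetan/asr/example
    beam_size=5
    len_penalty=1.0
    max_tokens=20000
    cer=0
    n_average=5

    suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
    if [[ -n ${cer} && ${cer} -eq 1 ]]; then
        suffix=${suffix}_cer
    else
        suffix=${suffix}_wer
    fi
    if [[ ${n_average} -ne 1 ]]; then
        suffix=${suffix}_${n_average}
    fi
    result_file=${model_dir}/decode_result_${suffix}
    echo ${result_file}
    # checkpoints/tibetan/asr/example/decode_result_beam5_alpha1.0_tokens20000_wer_5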